CellSPU:

- Ensure that (operation) legalization emits proper FDIV libcall when needed. - Fix various bugs encountered during llvm-spu-gcc build, along with various cleanups. - Start supporting double precision comparisons for remaining libgcc2 build. Discovered interesting DAGCombiner feature, which is currently solved via custom lowering (64-bit constants are not legal on CellSPU, but DAGCombiner insists on inserting one anyway.) - Update README. llvm-svn: 62664
2024-11-24 03:33:20 +01:00 · 2009-01-21 04:58:48 +00:00 · 2009-01-21 04:58:48 +00:00 · c80e71ac35
commit c80e71ac35
parent 074553c4fb
10 changed files with 320 additions and 133 deletions
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@ -3294,6 +3294,10 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
        LC = GetFPLibCall(VT, RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80,
                          RTLIB::POW_PPCF128);
        break;
+      case ISD::FDIV:
+        LC = GetFPLibCall(VT, RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80,
+                          RTLIB::DIV_PPCF128);
+        break;
      default: break;
      }
      if (LC != RTLIB::UNKNOWN_LIBCALL) {
--- a/lib/Target/CellSPU/README.txt
+++ b/lib/Target/CellSPU/README.txt
@ -8,7 +8,7 @@ Department in The Aerospace Corporation:
 - Mark Thomas (floating point instructions)
 - Michael AuYeung (intrinsics)
 - Chandler Carruth (LLVM expertise)
- Nehal Desai (debugging, RoadRunner SPU expertise)
+- Nehal Desai (debugging, i32 operations, RoadRunner SPU expertise)

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
@ -36,7 +36,7 @@ to add 'spu' to configure's --enable-targets option, e.g.:

 TODO:
 * Create a machine pass for performing dual-pipeline scheduling specifically
-  for CellSPU, handle inserting branch prediction instructions.
+  for CellSPU, and insert branch prediction instructions as needed.

 * i32 instructions:

@ -48,20 +48,43 @@ TODO:
  * sign and zero extension: done
  * addition: done
  * subtraction: needed
-  * multiplication: work-in-progress
+  * multiplication: done

 * i128 support:

-  * zero extension: done
+  * zero extension, any extension: done
  * sign extension: needed
  * arithmetic operators (add, sub, mul, div): needed
+  * logical operations (and, or, shl, srl, sra, xor, nor, nand): needed

-* Double floating point support
+    * or: done

-  This was started. "What's missing?" to be filled in.
+* f64 support
+
+  * Comparison operators:
+    SETOEQ              unimplemented
+    SETOGT              unimplemented
+    SETOGE              unimplemented
+    SETOLT              unimplemented
+    SETOLE              unimplemented
+    SETONE              unimplemented
+    SETO                done (lowered)
+    SETUO               done (lowered)
+    SETUEQ              unimplemented
+    SETUGT              unimplemented
+    SETUGE              unimplemented
+    SETULT              unimplemented
+    SETULE              unimplemented
+    SETUNE              unimplemented
+
+* LLVM vector support
+
+  * VSETCC needs to be implemented. It's pretty straightforward to code, but
+    needs implementation.

 * Intrinsics

-  Lots of progress. "What's missing/incomplete?" to be filled in.
+  * spu.h intrinsics added but not tested. Need to have an operational
+    llvm-spu-gcc in order to write a unit test harness.

 ===-------------------------------------------------------------------------===
--- a/lib/Target/CellSPU/SPU128InstrInfo.td
+++ b/lib/Target/CellSPU/SPU128InstrInfo.td
@ -2,7 +2,6 @@
 //
 //                     Cell SPU 128-bit operations
 //
-// Primary author: Scott Michel (scottm@aero.org)
 //===----------------------------------------------------------------------===//
                                  
 // zext 32->128: Zero extend 32-bit to 128-bit
@ -20,3 +19,23 @@ def : Pat<(i128 (zext R16C:$rSrc)),
 // zext 8->128: Zero extend 8-bit to 128-bit
 def : Pat<(i128 (zext R8C:$rSrc)),
          (ROTQMBYIr128_zext_r32 (ANDIi8i32 R8C:$rSrc, 0xf), 12)>;
+
+// anyext 32->128: Any-extend 32-bit to 128-bit (implemented as a zero extend)
+def : Pat<(i128 (anyext R32C:$rSrc)),
+          (ROTQMBYIr128_zext_r32 R32C:$rSrc, 12)>;
+
+// anyext 64->128: Any-extend 64-bit to 128-bit (implemented as a zero extend)
+def : Pat<(i128 (anyext R64C:$rSrc)),
+          (ROTQMBYIr128_zext_r64 R64C:$rSrc, 8)>;
+
+// anyext 16->128: Any-extend 16-bit to 128-bit (implemented as a zero extend)
+def : Pat<(i128 (anyext R16C:$rSrc)),
+          (ROTQMBYIr128_zext_r32 (ANDi16i32 R16C:$rSrc, (ILAr32 0xffff)), 12)>;
+
+// anyext 8->128: Any-extend 8-bit to 128-bit (implemented as a zero extend)
+def : Pat<(i128 (anyext R8C:$rSrc)),
+          (ROTQMBYIr128_zext_r32 (ANDIi8i32 R8C:$rSrc, 0xf), 12)>;
+
+// Shift left
+def : Pat<(shl GPRC:$rA, R32C:$rB),
+          (SHLQBYBIr128 (SHLQBIr128 GPRC:$rA, R32C:$rB), R32C:$rB)>;
--- a/lib/Target/CellSPU/SPU64InstrInfo.td
+++ b/lib/Target/CellSPU/SPU64InstrInfo.td
@ -33,6 +33,13 @@ def SELBr64_cond:
   SELBInst<(outs R64C:$rT), (ins R64C:$rA, R64C:$rB, VECREG:$rC),
            [/* no pattern */]>;

+// The generic i64 select pattern, which assumes that the comparison result
+// is in a 32-bit register that contains a select mask pattern (i.e., gather
+// bits result):
+
+def : Pat<(select R32C:$rCond, R64C:$rFalse, R64C:$rTrue),
+          (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 R32C:$rCond))>;
+
 // select the negative condition:
 class I64SELECTNegCond<PatFrag cond, CodeFrag compare>:
  Pat<(select (i32 (cond R64C:$rA, R64C:$rB)), R64C:$rTrue, R64C:$rFalse),
@ -43,13 +50,6 @@ class I64SETCCNegCond<PatFrag cond, CodeFrag compare>:
  Pat<(cond R64C:$rA, R64C:$rB),
      (XORIr32 compare.Fragment, -1)>;

-// The generic i64 select pattern, which assumes that the comparison result
-// is in a 32-bit register that contains a select mask pattern (i.e., gather
-// bits result):
-
-def : Pat<(select R32C:$rCond, R64C:$rFalse, R64C:$rTrue),
-          (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 R32C:$rCond))>;
-
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
 // The i64 seteq fragment that does the scalar->vector conversion and
 // comparison:
@ -381,3 +381,14 @@ def : Pat<(SPUmul64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
                    (v4i32 VECREG:$rCGmask)),
          v2i64_mul<(v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
                    (v4i32 VECREG:$rCGmask)>.Fragment>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// f64 comparisons
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// selb instruction definition for f64. Note that the selection mask is
+// a vector, produced by various forms of FSM:
+def SELBf64_cond:
+   SELBInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB, R32C:$rC),
+            [(set R64FP:$rT,
+                  (select R32C:$rC, R64FP:$rB, R64FP:$rA))]>;
--- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@ -742,6 +742,27 @@ SPUDAGToDAGISel::Select(SDValue Op) {
    SDNode *CGLoad =
            emitBuildVector(SPU::getCarryGenerateShufMask(*CurDAG));

+    return SelectCode(CurDAG->getNode(SPUISD::MUL64_MARKER, OpVT,
+                                      Op.getOperand(0), Op.getOperand(1),
+                                      SDValue(CGLoad, 0)));
+  } else if (Opc == ISD::ADD && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+    SDNode *CGLoad =
+            emitBuildVector(SPU::getCarryGenerateShufMask(*CurDAG));
+
+    return SelectCode(CurDAG->getNode(SPUISD::ADD64_MARKER, OpVT,
+                                      Op.getOperand(0), Op.getOperand(1),
+                                      SDValue(CGLoad, 0)));
+  } else if (Opc == ISD::SUB && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+    SDNode *CGLoad =
+            emitBuildVector(SPU::getBorrowGenerateShufMask(*CurDAG));
+
+    return SelectCode(CurDAG->getNode(SPUISD::SUB64_MARKER, OpVT,
+                                      Op.getOperand(0), Op.getOperand(1),
+                                      SDValue(CGLoad, 0)));
+  } else if (Opc == ISD::MUL && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+    SDNode *CGLoad =
+            emitBuildVector(SPU::getCarryGenerateShufMask(*CurDAG));
+
    return SelectCode(CurDAG->getNode(SPUISD::MUL64_MARKER, OpVT,
                                      Op.getOperand(0), Op.getOperand(1),
                                      SDValue(CGLoad, 0)));
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@ -92,6 +92,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

+  // Set RTLIB libcall names as used by SPU:
+  setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
+
  // Set up the SPU's register classes:
  addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
  addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
@ -183,6 +186,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

+  // Make sure that DAGCombine doesn't insert illegal 64-bit constants
+  setOperationAction(ISD::FABS,  MVT::f64, Custom);
+
  // SPU can do rotate right and left, so legalize it... but customize for i8
  // because instructions don't exist.

@ -243,6 +249,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  setOperationAction(ISD::SETCC, MVT::i16,   Legal);
  setOperationAction(ISD::SETCC, MVT::i32,   Legal);
  setOperationAction(ISD::SETCC, MVT::i64,   Legal);
+  setOperationAction(ISD::SETCC, MVT::f64,   Custom);

  // Custom lower i128 -> i64 truncates
  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
@ -410,6 +417,9 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
    node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA";
    node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
    node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
+    node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
+    node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
+            "SPUISD::ROTBYTES_LEFT_BITS";
    node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
    node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
    node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
@ -1552,12 +1562,9 @@ static bool isConstantSplat(const uint64_t Bits128[2],
  return false;  // Can't be a splat if two pieces don't match.
 }

-// If this is a case we can't handle, return null and let the default
-// expansion code take care of it.  If we CAN select this case, and if it
-// selects to a single instruction, return Op.  Otherwise, if we can codegen
-// this case more efficiently than a constant pool load, lower it to the
-// sequence of ops that should be used.
-static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+//! Lower a BUILD_VECTOR instruction creatively:
+SDValue
+SPU::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  // If this is a vector of constants or undefs, get the bits.  A bit in
  // UndefBits is set if the corresponding element of the vector is an
@ -1575,6 +1582,11 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {

  switch (VT.getSimpleVT()) {
  default:
+    cerr << "CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = "
+         << VT.getMVTString()
+         << "\n";
+    abort();
+    /*NOTREACHED*/
  case MVT::v4f32: {
    uint32_t Value32 = SplatBits;
    assert(SplatSize == 4
@ -2188,31 +2200,31 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,

 //! Generate the carry-generate shuffle mask.
 SDValue SPU::getCarryGenerateShufMask(SelectionDAG &DAG) {
-SmallVector<SDValue, 16> ShufBytes;
+  SmallVector<SDValue, 16 > ShufBytes;

-// Create the shuffle mask for "rotating" the borrow up one register slot
-// once the borrow is generated.
-ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
+  // Create the shuffle mask for "rotating" the carry up one register slot
+  // once the carry is generated.
+  ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));

-return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+  return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
                     &ShufBytes[0], ShufBytes.size());
 }

 //! Generate the borrow-generate shuffle mask
 SDValue SPU::getBorrowGenerateShufMask(SelectionDAG &DAG) {
-SmallVector<SDValue, 16> ShufBytes;
+  SmallVector<SDValue, 16 > ShufBytes;

-// Create the shuffle mask for "rotating" the borrow up one register slot
-// once the borrow is generated.
-ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+  // Create the shuffle mask for "rotating" the borrow up one register slot
+  // once the borrow is generated.
+  ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));

-return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+  return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
                     &ShufBytes[0], ShufBytes.size());
 }

@ -2372,6 +2384,83 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
  return SDValue();
 }

+//! Lower ISD::FABS
+/*!
+ Computes the absolute value of an f64 by reinterpreting it as an i64,
+ clearing the sign bit with an AND, and reinterpreting the result as f64.
+
+ DAGCombine performs the same basic reduction, but it inserts the i64
+ constant itself, which CellSPU then has to legalize. Doing the lowering
+ here is how that situation is avoided.
+ */
+
+static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
+  MVT ResultVT = Op.getValueType();
+  MVT BitsVT(MVT::i64);
+  SDValue Src = Op.getOperand(0);
+
+  assert(ResultVT == MVT::f64 && "LowerFABS: expecting MVT::f64!\n");
+
+  // AND the integer view of the operand with a mask that has every bit set
+  // except the sign bit.
+  SDValue AbsBits =
+          DAG.getNode(ISD::AND, BitsVT,
+                      DAG.getNode(ISD::BIT_CONVERT, BitsVT, Src),
+                      DAG.getConstant(~BitsVT.getIntegerVTSignBit(), BitsVT));
+
+  // Reinterpret the masked bits as a double again.
+  return DAG.getNode(ISD::BIT_CONVERT, MVT::f64, AbsBits);
+}
+
+//! Compare the i64 bit pattern of |Val| against posNaN using condition CC
+/*!
+ Helper for LowerSETCC: takes the absolute value of the f64 \a Val, converts
+ the result to its i64 bit pattern and emits an i32 SETCC of that pattern
+ against \a posNaN with the integer condition \a CC.
+ */
+
+static SDValue LowerF64NaNCompare(SDValue Val, SDValue posNaN,
+                                  ISD::CondCode CC, SelectionDAG &DAG) {
+  SDValue absVal = DAG.getNode(ISD::FABS, MVT::f64, Val);
+  SDValue i64Val = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, absVal);
+
+  return DAG.getSetCC(MVT::i32, i64Val, posNaN, CC);
+}
+
+//! Lower ISD::SETCC
+/*!
+ This handles MVT::f64 (double floating point) condition lowering.
+
+ Only the ordered (SETO) and unordered (SETUO) tests are implemented; every
+ other condition code prints an error and aborts.
+
+ An operand is classified by clearing its sign (FABS), viewing the bits as an
+ i64 and comparing against posNaN (0x7ff0000000000001): patterns at or above
+ that value are NaNs, everything below it (including infinity,
+ 0x7ff0000000000000) is not.
+
+ SETO and SETUO describe both operands, so the per-operand results are
+ combined: SETO is true only when neither operand is a NaN (AND), SETUO is
+ true when either operand is a NaN (OR).
+ */
+
+static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
+                          const TargetLowering &TLI) {
+  SDValue lhs = Op.getOperand(0);
+  SDValue rhs = Op.getOperand(1);
+  CondCodeSDNode *CC = dyn_cast<CondCodeSDNode > (Op.getOperand(2));
+  MVT lhsVT = lhs.getValueType();
+  // Smallest bit pattern, with the sign bit clear, that encodes a NaN.
+  SDValue posNaN = DAG.getConstant(0x7ff0000000000001ULL, MVT::i64);
+
+  assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
+  assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::f64\n");
+
+  switch (CC->get()) {
+  case ISD::SETOEQ:
+  case ISD::SETOGT:
+  case ISD::SETOGE:
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETONE:
+    cerr << "CellSPU ISel Select: unimplemented f64 condition\n";
+    abort();
+    break;
+  case ISD::SETO: {
+    // Ordered: both operands must compare below the first NaN pattern.
+    SDValue lhsOrdered = LowerF64NaNCompare(lhs, posNaN, ISD::SETLT, DAG);
+    SDValue rhsOrdered = LowerF64NaNCompare(rhs, posNaN, ISD::SETLT, DAG);
+
+    return DAG.getNode(ISD::AND, MVT::i32, lhsOrdered, rhsOrdered);
+  }
+  case ISD::SETUO: {
+    // Unordered: at least one operand is at or above the first NaN pattern.
+    SDValue lhsIsNaN = LowerF64NaNCompare(lhs, posNaN, ISD::SETGE, DAG);
+    SDValue rhsIsNaN = LowerF64NaNCompare(rhs, posNaN, ISD::SETGE, DAG);
+
+    return DAG.getNode(ISD::OR, MVT::i32, lhsIsNaN, rhsIsNaN);
+  }
+  case ISD::SETUEQ:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETUNE:
+  default:
+    cerr << "CellSPU ISel Select: unimplemented f64 condition\n";
+    abort();
+    break;
+  }
+
+  return SDValue();
+}
+
 //! Lower ISD::SELECT_CC
 /*!
  ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
@ -2501,9 +2590,12 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
    break;
  }

+  case ISD::FABS:
+    return LowerFABS(Op, DAG);
+
  // Vector-related lowering.
  case ISD::BUILD_VECTOR:
-    return LowerBUILD_VECTOR(Op, DAG);
+    return SPU::LowerBUILD_VECTOR(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:
    return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
@ -2530,6 +2622,9 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
  case ISD::SELECT_CC:
    return LowerSELECT_CC(Op, DAG, *this);

+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG, *this);
+
  case ISD::TRUNCATE:
    return LowerTRUNCATE(Op, DAG);
  }
@ -2656,8 +2751,8 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
  }
  case SPUISD::IndirectAddr: {
    if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
-      ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(1));
-      if (CN->getZExtValue() == 0) {
+      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+      if (CN != 0 && CN->getZExtValue() == 0) {
        // (SPUindirect (SPUaform <addr>, 0), 0) ->
        // (SPUaform <addr>, 0)

@ -2809,41 +2904,18 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                  unsigned Depth ) const {
 #if 0
  const uint64_t uint64_sizebits = sizeof(uint64_t) * 8;
-#endif

  switch (Op.getOpcode()) {
  default:
    // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
    break;
-
-#if 0
  case CALL:
  case SHUFB:
  case SHUFFLE_MASK:
  case CNTB:
-#endif
-
-  case SPUISD::PREFSLOT2VEC: {
-    SDValue Op0 = Op.getOperand(0);
-    MVT Op0VT = Op0.getValueType();
-    unsigned Op0VTBits = Op0VT.getSizeInBits();
-    uint64_t InMask = Op0VT.getIntegerVTBitMask();
-    KnownZero |= APInt(Op0VTBits, ~InMask, false);
-    KnownOne |= APInt(Op0VTBits, InMask, false);
-    break;
-  }
-
+  case SPUISD::PREFSLOT2VEC:
  case SPUISD::LDRESULT:
-  case SPUISD::VEC2PREFSLOT: {
-    MVT OpVT = Op.getValueType();
-    unsigned OpVTBits = OpVT.getSizeInBits();
-    uint64_t InMask = OpVT.getIntegerVTBitMask();
-    KnownZero |= APInt(OpVTBits, ~InMask, false);
-    KnownOne |= APInt(OpVTBits, InMask, false);
-    break;
-  }
-
-#if 0
+  case SPUISD::VEC2PREFSLOT:
  case SPUISD::SHLQUAD_L_BITS:
  case SPUISD::SHLQUAD_L_BYTES:
  case SPUISD::VEC_SHL:
@ -2854,8 +2926,8 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
  case SPUISD::ROTBYTES_LEFT:
  case SPUISD::SELECT_MASK:
  case SPUISD::SELB:
-#endif
  }
+#endif
 }

 unsigned
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@ -61,7 +61,7 @@ namespace llvm {
    };
  }

-  /// Predicates that are used for node matching:
+  //! Utility functions specific to CellSPU-only:
  namespace SPU {
    SDValue get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
                             MVT ValueType);
@ -78,6 +78,7 @@ namespace llvm {

    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG,
                              const SPUTargetMachine &TM);
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);

    SDValue getBorrowGenerateShufMask(SelectionDAG &DAG);
    SDValue getCarryGenerateShufMask(SelectionDAG &DAG);
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@ -134,6 +134,7 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
  case SPU::ORi64_v2i64:
  case SPU::ORf32_v4f32:
  case SPU::ORf64_v2f64:
+/*
  case SPU::ORi128_r64:
  case SPU::ORi128_f64:
  case SPU::ORi128_r32:
@ -148,6 +149,8 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
  case SPU::ORr16_i128:
  case SPU::ORr8_i128:
  case SPU::ORvec_i128:
+*/
+/*
  case SPU::ORr16_r32:
  case SPU::ORr8_r32:
  case SPU::ORr32_r16:
@ -158,7 +161,11 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
  case SPU::ORr64_r32:
  case SPU::ORr64_r16:
  case SPU::ORr64_r8:
-  {
+*/
+  case SPU::ORf32_r32:
+  case SPU::ORr32_f32:
+  case SPU::ORf64_r64:
+  case SPU::ORr64_f64: {
    assert(MI.getNumOperands() == 2 &&
           MI.getOperand(0).isReg() &&
           MI.getOperand(1).isReg() &&
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@ -1259,9 +1259,6 @@ multiclass BitwiseAnd
  def fabs32: ANDInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB),
                      [/* Intentionally does not match a pattern */]>;

-  def fabs64: ANDInst<(outs R64FP:$rT), (ins R64FP:$rA, VECREG:$rB),
-                      [/* Intentionally does not match a pattern */]>;
-
  // Could use v4i32, but won't for clarity
  def fabsvec: ANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
                       [/* Intentionally does not match a pattern */]>;
@ -1408,12 +1405,12 @@ class ORRegInst<RegisterClass rclass>:
 // These are effectively no-ops, but need to exist for proper type conversion
 // and type coercion.

-class ORCvtForm<dag OOL, dag IOL>
+class ORCvtForm<dag OOL, dag IOL, list<dag> pattern = [/* no pattern */]>
          : SPUInstr<OOL, IOL, "or\t$rT, $rA, $rA", IntegerOp> {
  bits<7> RA;
  bits<7> RT;

-  let Pattern = [/* no pattern */];
+  let Pattern = pattern;

  let Inst{0-10} = 0b10000010000;
  let Inst{11-17} = RA;
@ -1427,29 +1424,29 @@ class ORPromoteScalar<RegisterClass rclass>:
 class ORExtractElt<RegisterClass rclass>:
    ORCvtForm<(outs rclass:$rT), (ins VECREG:$rA)>;

-class ORCvtRegGPRC<RegisterClass rclass>:
-    ORCvtForm<(outs GPRC:$rT), (ins rclass:$rA)>;
+/* class ORCvtRegGPRC<RegisterClass rclass>:
+    ORCvtForm<(outs GPRC:$rT), (ins rclass:$rA)>; */

-class ORCvtVecGPRC:
-    ORCvtForm<(outs GPRC:$rT), (ins VECREG:$rA)>;
+/* class ORCvtVecGPRC:
+    ORCvtForm<(outs GPRC:$rT), (ins VECREG:$rA)>; */

-class ORCvtGPRCReg<RegisterClass rclass>:
-    ORCvtForm<(outs rclass:$rT), (ins GPRC:$rA)>;
+/* class ORCvtGPRCReg<RegisterClass rclass>:
+    ORCvtForm<(outs rclass:$rT), (ins GPRC:$rA)>; */
    
-class ORCvtFormR32Reg<RegisterClass rclass>:
-    ORCvtForm<(outs rclass:$rT), (ins R32C:$rA)>;
+class ORCvtFormR32Reg<RegisterClass rclass, list<dag> pattern = [ ]>:
+    ORCvtForm<(outs rclass:$rT), (ins R32C:$rA), pattern>;
    
-class ORCvtFormRegR32<RegisterClass rclass>:
-    ORCvtForm<(outs R32C:$rT), (ins rclass:$rA)>;
+class ORCvtFormRegR32<RegisterClass rclass, list<dag> pattern = [ ]>:
+    ORCvtForm<(outs R32C:$rT), (ins rclass:$rA), pattern>;

-class ORCvtFormR64Reg<RegisterClass rclass>:
-    ORCvtForm<(outs rclass:$rT), (ins R64C:$rA)>;
+class ORCvtFormR64Reg<RegisterClass rclass, list<dag> pattern = [ ]>:
+    ORCvtForm<(outs rclass:$rT), (ins R64C:$rA), pattern>;
    
-class ORCvtFormRegR64<RegisterClass rclass>:
-    ORCvtForm<(outs R64C:$rT), (ins rclass:$rA)>;
+class ORCvtFormRegR64<RegisterClass rclass, list<dag> pattern = [ ]>:
+    ORCvtForm<(outs R64C:$rT), (ins rclass:$rA), pattern>;

-class ORCvtGPRCVec:
-    ORCvtForm<(outs VECREG:$rT), (ins GPRC:$rA)>;
+/* class ORCvtGPRCVec:
+    ORCvtForm<(outs VECREG:$rT), (ins GPRC:$rA)>; */

 multiclass BitwiseOr
 {
@ -1468,6 +1465,7 @@ multiclass BitwiseOr
                          (v2f64 (bitconvert (or (v2i64 VECREG:$rA),
                                                 (v2i64 VECREG:$rB)))))]>;

+  def r128: ORRegInst<GPRC>;
  def r64:  ORRegInst<R64C>;
  def r32:  ORRegInst<R32C>;
  def r16:  ORRegInst<R16C>;
@ -1496,6 +1494,7 @@ multiclass BitwiseOr
  def f32_v4f32: ORExtractElt<R32FP>;
  def f64_v2f64: ORExtractElt<R64FP>;

+/*
  // Conversion from GPRC to register
  def i128_r64:  ORCvtRegGPRC<R64C>;
  def i128_f64:  ORCvtRegGPRC<R64FP>;
@ -1517,7 +1516,8 @@ multiclass BitwiseOr

  // Conversion from vector to GPRC
  def vec_i128:  ORCvtGPRCVec;
-  
+*/
+/*
  // Conversion from register to R32C:
  def r16_r32:   ORCvtFormRegR32<R16C>;
  def r8_r32:    ORCvtFormRegR32<R8C>;
@ -1535,6 +1535,18 @@ multiclass BitwiseOr
  def r64_r32:   ORCvtFormRegR64<R32C>;
  def r64_r16:   ORCvtFormRegR64<R16C>;
  def r64_r8:    ORCvtFormRegR64<R8C>;
+*/
+
+  // bitconvert patterns:
+  def r32_f32:   ORCvtFormR32Reg<R32FP,
+                                 [(set R32FP:$rT, (bitconvert R32C:$rA))]>;
+  def f32_r32:   ORCvtFormRegR32<R32FP,
+                                 [(set R32C:$rT, (bitconvert R32FP:$rA))]>;
+
+  def r64_f64:   ORCvtFormR64Reg<R64FP,
+                                 [(set R64FP:$rT, (bitconvert R64C:$rA))]>;
+  def f64_r64:   ORCvtFormRegR64<R64FP,
+                                 [(set R64C:$rT, (bitconvert R64FP:$rA))]>;
 }

 defm OR : BitwiseOr;
@ -1960,7 +1972,7 @@ multiclass SelectBits
 			       (v4f32 VECREG:$rB),
 			       (v4f32 VECREG:$rA)))]>;

-  // SELBr64_cond is defined further down, look for i64 comparisons
+  // SELBr64_cond is defined in SPU64InstrInfo.td
  def r32_cond:   SELBRegCondInst<R32C, R32C>;
  def f32_cond:   SELBRegCondInst<R32C, R32FP>;
  def r16_cond:   SELBRegCondInst<R16C, R16C>;
@ -2146,14 +2158,6 @@ class SHLHVecInst<ValueType vectype>:
             [(set (vectype VECREG:$rT),
                   (SPUvec_shl (vectype VECREG:$rA), R16C:$rB))]>;

-// $rB gets promoted to 32-bit register type when confronted with
-// this llvm assembly code:
-//
-// define i16 @shlh_i16_1(i16 %arg1, i16 %arg2) {
-//      %A = shl i16 %arg1, %arg2
-//      ret i16 %A
-// }
-
 multiclass ShiftLeftHalfword
 {
  def v8i16: SHLHVecInst<v8i16>;
@ -2250,6 +2254,10 @@ class SHLQBIVecInst<ValueType vectype>:
               [(set (vectype VECREG:$rT),
                     (SPUshlquad_l_bits (vectype VECREG:$rA), R32C:$rB))]>;

+class SHLQBIRegInst<RegisterClass rclass>:
+    SHLQBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+               [/* no pattern */]>;
+
 multiclass ShiftLeftQuadByBits
 {
  def v16i8: SHLQBIVecInst<v16i8>;
@ -2258,6 +2266,8 @@ multiclass ShiftLeftQuadByBits
  def v4f32: SHLQBIVecInst<v4f32>;
  def v2i64: SHLQBIVecInst<v2i64>;
  def v2f64: SHLQBIVecInst<v2f64>;
+
+  def r128:  SHLQBIRegInst<GPRC>;
 }

 defm SHLQBI : ShiftLeftQuadByBits;
@ -2335,6 +2345,32 @@ multiclass ShiftLeftQuadBytesImm

 defm SHLQBYI : ShiftLeftQuadBytesImm;

+class SHLQBYBIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b00111001111, OOL, IOL, "shlqbybi\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+class SHLQBYBIVecInst<ValueType vectype>:
+    SHLQBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+                [/* no pattern */]>;
+
+class SHLQBYBIRegInst<RegisterClass rclass>:
+    SHLQBYBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+                 [/* no pattern */]>;
+
+multiclass ShiftLeftQuadBytesBitCount
+{
+  def v16i8: SHLQBYBIVecInst<v16i8>;
+  def v8i16: SHLQBYBIVecInst<v8i16>;
+  def v4i32: SHLQBYBIVecInst<v4i32>;
+  def v4f32: SHLQBYBIVecInst<v4f32>;
+  def v2i64: SHLQBYBIVecInst<v2i64>;
+  def v2f64: SHLQBYBIVecInst<v2f64>;
+
+  def r128:  SHLQBYBIRegInst<GPRC>;
+}
+
+defm SHLQBYBI : ShiftLeftQuadBytesBitCount;
+
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
 // Rotate halfword:
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
@ -4285,13 +4321,6 @@ def : Pat<(fabs (v4f32 VECREG:$rA)),
          (ANDfabsvec (v4f32 VECREG:$rA),
                      (v4f32 (ANDBIv16i8 (FSMBIv16i8 0xffff), 0x7f)))>;

-def : Pat<(fabs R64FP:$rA),
-          (ANDfabs64 R64FP:$rA, (ANDBIv16i8 (FSMBIv16i8 0xffff), 0x7f))>;
-
-def : Pat<(fabs (v2f64 VECREG:$rA)),
-          (ANDfabsvec (v2f64 VECREG:$rA),
-                      (v2f64 (ANDBIv16i8 (FSMBIv16i8 0xffff), 0x7f)))>;
-
 //===----------------------------------------------------------------------===//
 // Hint for branch instructions:
 //===----------------------------------------------------------------------===//
--- a/test/CodeGen/CellSPU/fneg-fabs.ll
+++ b/test/CodeGen/CellSPU/fneg-fabs.ll
@ -1,9 +1,9 @@
 ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
-; RUN: grep fsmbi   %t1.s | count 3
+; RUN: grep fsmbi   %t1.s | count 2
 ; RUN: grep 32768   %t1.s | count 2
 ; RUN: grep xor     %t1.s | count 4
-; RUN: grep and     %t1.s | count 5
-; RUN: grep andbi   %t1.s | count 3
+; RUN: grep and     %t1.s | count 4
+; RUN: grep andbi   %t1.s | count 2
 target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
 target triple = "spu"