1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 18:54:02 +01:00

[SystemZ] computeKnownBitsForTargetNode() / ComputeNumSignBitsForTargetNode()

Improve/implement these methods to strengthen DAG combining, mainly for
the SystemZ vector intrinsics.

Some constant operands to SystemZISD nodes have been marked Opaque to avoid
transforming back and forth between generic and target nodes infinitely.

Review: Ulrich Weigand
llvm-svn: 327765
This commit is contained in:
Jonas Paulsson 2018-03-17 08:32:12 +00:00
parent cebc088508
commit 3203edf16f
9 changed files with 1560 additions and 21 deletions

View File

@ -4185,12 +4185,15 @@ static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
const SDLoc &DL, EVT VT, uint64_t Value,
unsigned BitsPerElement) {
// Signed 16-bit values can be replicated using VREPI.
// Mark the constants as opaque or DAGCombiner will convert back to
// BUILD_VECTOR.
int64_t SignedValue = SignExtend64(Value, BitsPerElement);
if (isInt<16>(SignedValue)) {
MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
SystemZ::VectorBits / BitsPerElement);
SDValue Op = DAG.getNode(SystemZISD::REPLICATE, DL, VecVT,
DAG.getConstant(SignedValue, DL, MVT::i32));
SDValue Op = DAG.getNode(
SystemZISD::REPLICATE, DL, VecVT,
DAG.getConstant(SignedValue, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
// See whether rotating the constant left some N places gives a value that
@ -4206,9 +4209,10 @@ static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
End -= 64 - BitsPerElement;
MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
SystemZ::VectorBits / BitsPerElement);
SDValue Op = DAG.getNode(SystemZISD::ROTATE_MASK, DL, VecVT,
DAG.getConstant(Start, DL, MVT::i32),
DAG.getConstant(End, DL, MVT::i32));
SDValue Op = DAG.getNode(
SystemZISD::ROTATE_MASK, DL, VecVT,
DAG.getConstant(Start, DL, MVT::i32, false, true /*isOpaque*/),
DAG.getConstant(End, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
return SDValue();
@ -4421,8 +4425,9 @@ SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
// priority over other methods below.
uint64_t Mask = 0;
if (tryBuildVectorByteMask(BVN, Mask)) {
SDValue Op = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
DAG.getConstant(Mask, DL, MVT::i32));
SDValue Op = DAG.getNode(
SystemZISD::BYTE_MASK, DL, MVT::v16i8,
DAG.getConstant(Mask, DL, MVT::i32, false, true /*isOpaque*/));
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
@ -5605,28 +5610,293 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
// Return the demanded elements for the OpNo source operand of Op. DemandedElts
// are for Op.
//
// Maps "which elements of Op's result are demanded" back onto "which elements
// of source operand OpNo are needed to produce them", per the semantics of
// each SystemZ vector intrinsic / node.  OpNo is 1-based for intrinsics
// (operand 0 is the intrinsic ID) and 0-based for SystemZISD nodes.
static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
                                    unsigned OpNo) {
  EVT VT = Op.getValueType();
  unsigned NumElts = (VT.isVector() ? VT.getVectorNumElements() : 1);
  APInt SrcDemE;
  unsigned Opcode = Op.getOpcode();
  if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
    unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (Id) {
    case Intrinsic::s390_vpksh:   // PACKS
    case Intrinsic::s390_vpksf:
    case Intrinsic::s390_vpksg:
    case Intrinsic::s390_vpkshs:  // PACKS_CC
    case Intrinsic::s390_vpksfs:
    case Intrinsic::s390_vpksgs:
    case Intrinsic::s390_vpklsh:  // PACKLS
    case Intrinsic::s390_vpklsf:
    case Intrinsic::s390_vpklsg:
    case Intrinsic::s390_vpklshs: // PACKLS_CC
    case Intrinsic::s390_vpklsfs:
    case Intrinsic::s390_vpklsgs:
      // VECTOR PACK truncates the elements of two source vectors into one.
      // Operand 1 supplies the high half of the result, operand 2 the low
      // half, so shift the demanded mask down for operand 2 before narrowing
      // it to the source element count.
      SrcDemE = DemandedElts;
      if (OpNo == 2)
        SrcDemE.lshrInPlace(NumElts / 2);
      SrcDemE = SrcDemE.trunc(NumElts / 2);
      break;
      // VECTOR UNPACK extends half the elements of the source vector.
    case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
    case Intrinsic::s390_vuphh:
    case Intrinsic::s390_vuphf:
    case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
    case Intrinsic::s390_vuplhh:
    case Intrinsic::s390_vuplhf:
      // HIGH forms read the low-indexed (leftmost) half of the source.
      SrcDemE = APInt(NumElts * 2, 0);
      SrcDemE.insertBits(DemandedElts, 0);
      break;
    case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
    case Intrinsic::s390_vuplhw:
    case Intrinsic::s390_vuplf:
    case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
    case Intrinsic::s390_vupllh:
    case Intrinsic::s390_vupllf:
      // LOW forms read the high-indexed (rightmost) half of the source.
      SrcDemE = APInt(NumElts * 2, 0);
      SrcDemE.insertBits(DemandedElts, NumElts);
      break;
    case Intrinsic::s390_vpdi: {
      // VECTOR PERMUTE DWORD IMMEDIATE selects one element from each source.
      SrcDemE = APInt(NumElts, 0);
      if (!DemandedElts[OpNo - 1])
        break;
      unsigned Mask = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned MaskBit = ((OpNo - 1) ? 1 : 4);
      // Demand input element 0 or 1, given by the mask bit value.
      SrcDemE.setBit((Mask & MaskBit)? 1 : 0);
      break;
    }
    case Intrinsic::s390_vsldb: {
      // VECTOR SHIFT LEFT DOUBLE BY BYTE
      // The result is the concatenated sources shifted left by FirstIdx
      // bytes: the first NumSrc0Els result bytes come from operand 1
      // (starting at byte FirstIdx), the rest from operand 2.
      assert(VT == MVT::v16i8 && "Unexpected type.");
      unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand.");
      unsigned NumSrc0Els = 16 - FirstIdx;
      SrcDemE = APInt(NumElts, 0);
      if (OpNo == 1) {
        APInt DemEls = DemandedElts.trunc(NumSrc0Els);
        SrcDemE.insertBits(DemEls, FirstIdx);
      } else {
        APInt DemEls = DemandedElts.lshr(NumSrc0Els);
        SrcDemE.insertBits(DemEls, 0);
      }
      break;
    }
    case Intrinsic::s390_vperm:
      // VECTOR PERMUTE selects bytes from both sources under control of a
      // variable mask operand, so any source element may contribute to any
      // result element: conservatively demand them all.  (APInt(NumElts, 1)
      // would demand only element 0, which is unsound here.)
      SrcDemE = APInt::getAllOnesValue(NumElts);
      break;
    default:
      llvm_unreachable("Unhandled intrinsic.");
      break;
    }
  } else {
    switch (Opcode) {
    case SystemZISD::JOIN_DWORDS:
      // Scalar operand.
      SrcDemE = APInt(1, 1);
      break;
    case SystemZISD::SELECT_CCMASK:
      // Both selected values have the same element layout as the result.
      SrcDemE = DemandedElts;
      break;
    default:
      llvm_unreachable("Unhandled opcode.");
      break;
    }
  }
  return SrcDemE;
}
static void computeKnownBitsBinOp(const SDValue Op, KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG, unsigned Depth,
unsigned OpNo) {
APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
KnownBits LHSKnown(SrcBitWidth), RHSKnown(SrcBitWidth);
DAG.computeKnownBits(Op.getOperand(OpNo), LHSKnown, Src0DemE, Depth + 1);
DAG.computeKnownBits(Op.getOperand(OpNo + 1), RHSKnown, Src1DemE, Depth + 1);
Known.Zero = LHSKnown.Zero & RHSKnown.Zero;
Known.One = LHSKnown.One & RHSKnown.One;
}
void
SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
Known.resetAll();
switch (Op.getOpcode()) {
case SystemZISD::SELECT_CCMASK: {
KnownBits TrueKnown(BitWidth), FalseKnown(BitWidth);
DAG.computeKnownBits(Op.getOperand(0), TrueKnown, Depth + 1);
DAG.computeKnownBits(Op.getOperand(1), FalseKnown, Depth + 1);
Known.Zero = TrueKnown.Zero & FalseKnown.Zero;
Known.One = TrueKnown.One & FalseKnown.One;
break;
// Intrinsic CC result is returned in the two low bits.
unsigned tmp0, tmp1; // not used
if (Op.getResNo() == 1 && isIntrinsicWithCC(Op, tmp0, tmp1)) {
Known.Zero.setBitsFrom(2);
return;
}
EVT VT = Op.getValueType();
if (Op.getResNo() != 0 || VT == MVT::Untyped)
return;
assert (Known.getBitWidth() == VT.getScalarSizeInBits() &&
"KnownBits does not match VT in bitwidth");
assert ((!VT.isVector() ||
(DemandedElts.getBitWidth() == VT.getVectorNumElements())) &&
"DemandedElts does not match VT number of elements");
unsigned BitWidth = Known.getBitWidth();
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
bool IsLogical = false;
unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (Id) {
case Intrinsic::s390_vpksh: // PACKS
case Intrinsic::s390_vpksf:
case Intrinsic::s390_vpksg:
case Intrinsic::s390_vpkshs: // PACKS_CC
case Intrinsic::s390_vpksfs:
case Intrinsic::s390_vpksgs:
case Intrinsic::s390_vpklsh: // PACKLS
case Intrinsic::s390_vpklsf:
case Intrinsic::s390_vpklsg:
case Intrinsic::s390_vpklshs: // PACKLS_CC
case Intrinsic::s390_vpklsfs:
case Intrinsic::s390_vpklsgs:
case Intrinsic::s390_vpdi:
case Intrinsic::s390_vsldb:
case Intrinsic::s390_vperm:
computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 1);
break;
case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
case Intrinsic::s390_vuplhh:
case Intrinsic::s390_vuplhf:
case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
case Intrinsic::s390_vupllh:
case Intrinsic::s390_vupllf:
IsLogical = true;
LLVM_FALLTHROUGH;
case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
case Intrinsic::s390_vuphh:
case Intrinsic::s390_vuphf:
case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
case Intrinsic::s390_vuplhw:
case Intrinsic::s390_vuplf: {
SDValue SrcOp = Op.getOperand(1);
unsigned SrcBitWidth = SrcOp.getScalarValueSizeInBits();
Known = KnownBits(SrcBitWidth);
APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
DAG.computeKnownBits(SrcOp, Known, SrcDemE, Depth + 1);
if (IsLogical) {
Known = Known.zext(BitWidth);
Known.Zero.setBitsFrom(SrcBitWidth);
} else
Known = Known.sext(BitWidth);
break;
}
default:
break;
}
} else {
switch (Opcode) {
case SystemZISD::JOIN_DWORDS:
case SystemZISD::SELECT_CCMASK:
computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 0);
break;
case SystemZISD::REPLICATE: {
SDValue SrcOp = Op.getOperand(0);
DAG.computeKnownBits(SrcOp, Known, Depth + 1);
if (Known.getBitWidth() < BitWidth && isa<ConstantSDNode>(SrcOp))
Known = Known.sext(BitWidth); // VREPI sign extends the immedate.
break;
}
default:
break;
}
}
default:
break;
// Known has the width of the source operand(s). Adjust if needed to match
// the passed bitwidth.
if (Known.getBitWidth() != BitWidth)
Known = Known.zextOrTrunc(BitWidth);
}
// Return the number of sign bits of Op derived from its two value operands
// (operands OpNo and OpNo + 1): the minimum over both sources, reduced by the
// element-width difference when the node narrows its sources (PACK).
static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
                                        const SelectionDAG &DAG, unsigned Depth,
                                        unsigned OpNo) {
  APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
  unsigned Src0SignBits =
      DAG.ComputeNumSignBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
  if (Src0SignBits == 1)
    return 1; // Early out.
  APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
  unsigned Src1SignBits =
      DAG.ComputeNumSignBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
  if (Src1SignBits == 1)
    return 1; // Early out.
  unsigned Common = std::min(Src0SignBits, Src1SignBits);
  unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
  unsigned VTBits = Op.getValueType().getScalarSizeInBits();
  if (SrcBitWidth == VTBits)
    return Common;
  // PACK: the top SrcBitWidth - VTBits bits of each source element are
  // dropped, taking that many sign bits with them.
  assert (SrcBitWidth > VTBits && "Expected operands of same bitwidth.");
  unsigned SrcExtraBits = SrcBitWidth - VTBits;
  return (Common > SrcExtraBits ? Common - SrcExtraBits : 1);
}
// Target hook: report the number of known sign bits in result 0 of a SystemZ
// node or vector intrinsic.  Returning 1 means "nothing known beyond the
// generic analysis".
unsigned
SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
// Only the first result carries a data value (a second result, when
// present, is the CC value).
if (Op.getResNo() != 0)
return 1;
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (Id) {
// Two-input operations: take the minimum over both sources (with PACK
// narrowing handled inside computeNumSignBitsBinOp).
case Intrinsic::s390_vpksh: // PACKS
case Intrinsic::s390_vpksf:
case Intrinsic::s390_vpksg:
case Intrinsic::s390_vpkshs: // PACKS_CC
case Intrinsic::s390_vpksfs:
case Intrinsic::s390_vpksgs:
case Intrinsic::s390_vpklsh: // PACKLS
case Intrinsic::s390_vpklsf:
case Intrinsic::s390_vpklsg:
case Intrinsic::s390_vpklshs: // PACKLS_CC
case Intrinsic::s390_vpklsfs:
case Intrinsic::s390_vpklsgs:
case Intrinsic::s390_vpdi:
case Intrinsic::s390_vsldb:
case Intrinsic::s390_vperm:
return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 1);
// Sign-extending unpacks: every bit gained over the source element width
// is a copy of the source sign bit.  (The logical unpacks are not listed
// here; nothing is claimed for them.)
case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
case Intrinsic::s390_vuphh:
case Intrinsic::s390_vuphf:
case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
case Intrinsic::s390_vuplhw:
case Intrinsic::s390_vuplf: {
SDValue PackedOp = Op.getOperand(1);
APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 1);
unsigned Tmp = DAG.ComputeNumSignBits(PackedOp, SrcDemE, Depth + 1);
EVT VT = Op.getValueType();
unsigned VTBits = VT.getScalarSizeInBits();
// Add one sign bit per bit of widening.
Tmp += VTBits - PackedOp.getScalarValueSizeInBits();
return Tmp;
}
default:
break;
}
} else {
switch (Opcode) {
case SystemZISD::SELECT_CCMASK:
// Either source may be selected, so take the minimum over both.
return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 0);
default:
break;
}
}
return 1;
}
//===----------------------------------------------------------------------===//

View File

@ -499,6 +499,12 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
/// Determine the number of bits in the operation that are sign bits.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const override;
ISD::NodeType getExtendForAtomicOps() const override {
return ISD::ANY_EXTEND;
}

View File

@ -0,0 +1,460 @@
; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode() with
; vector intrinsics.
;
; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>)
declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>)
declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>)
; PACKS_CC (operand elements are 0): i64 -> i32
define <4 x i32> @f0() {
; CHECK-LABEL: f0:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
%extr = extractvalue {<4 x i32>, i32} %call, 0
%and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; PACKS_CC (operand elements are 1): i64 -> i32
; NOTE: The vector AND is optimized away, but vrepig+vpksgs is used instead
; of vrepif. Similarly for more test cases below.
define <4 x i32> @f1() {
; CHECK-LABEL: f1:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepig %v0, 1
; CHECK-NEXT: vpksgs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
%extr = extractvalue {<4 x i32>, i32} %call, 0
%and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; PACKS_CC (operand elements are 0): i32 -> i16
define <8 x i16> @f2() {
; CHECK-LABEL: f2:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>)
%extr = extractvalue {<8 x i16>, i32} %call, 0
%and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; PACKS_CC (operand elements are 1): i32 -> i16
define <8 x i16> @f3() {
; CHECK-LABEL: f3:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepif %v0, 1
; CHECK-NEXT: vpksfs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
<4 x i32> <i32 1, i32 1, i32 1, i32 1>)
%extr = extractvalue {<8 x i16>, i32} %call, 0
%and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; PACKS_CC (operand elements are 0): i16 -> i8
define <16 x i8> @f4() {
; CHECK-LABEL: f4:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vpkshs(
<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
%extr = extractvalue {<16 x i8>, i32} %call, 0
%and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %and
}
; PACKS_CC (operand elements are 1): i16 -> i8
define <16 x i8> @f5() {
; CHECK-LABEL: f5:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepih %v0, 1
; CHECK-NEXT: vpkshs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vpkshs(
<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
%extr = extractvalue {<16 x i8>, i32} %call, 0
%and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %and
}
declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>)
declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>)
declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>)
; PACKLS_CC (operand elements are 0): i64 -> i32
define <4 x i32> @f6() {
; CHECK-LABEL: f6:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
%extr = extractvalue {<4 x i32>, i32} %call, 0
%and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; PACKLS_CC (operand elements are 1): i64 -> i32
define <4 x i32> @f7() {
; CHECK-LABEL: f7:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepig %v0, 1
; CHECK-NEXT: vpklsgs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
%extr = extractvalue {<4 x i32>, i32} %call, 0
%and = and <4 x i32> %extr, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; PACKLS_CC (operand elements are 0): i32 -> i16
define <8 x i16> @f8() {
; CHECK-LABEL: f8:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>)
%extr = extractvalue {<8 x i16>, i32} %call, 0
%and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; PACKLS_CC (operand elements are 1): i32 -> i16
define <8 x i16> @f9() {
; CHECK-LABEL: f9:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepif %v0, 1
; CHECK-NEXT: vpklsfs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
<4 x i32> <i32 1, i32 1, i32 1, i32 1>)
%extr = extractvalue {<8 x i16>, i32} %call, 0
%and = and <8 x i16> %extr, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; PACKLS_CC (operand elements are 0): i16 -> i8
define <16 x i8> @f10() {
; CHECK-LABEL: f10:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vpklshs(
<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
%extr = extractvalue {<16 x i8>, i32} %call, 0
%and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %and
}
; PACKLS_CC (operand elements are 1): i16 -> i8
define <16 x i8> @f11() {
; CHECK-LABEL: f11:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepih %v0, 1
; CHECK-NEXT: vpklshs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vpklshs(
<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
%extr = extractvalue {<16 x i8>, i32} %call, 0
%and = and <16 x i8> %extr, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %and
}
declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>)
; PACKS (operand elements are 0): i64 -> i32
define <4 x i32> @f12() {
; CHECK-LABEL: f12:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call <4 x i32> @llvm.s390.vpksg(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
%and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; PACKS (operand elements are 1): i64 -> i32
define <4 x i32> @f13() {
; CHECK-LABEL: f13:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepig %v0, 1
; CHECK-NEXT: vpksg %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call <4 x i32> @llvm.s390.vpksg(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
%and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; PACKS (operand elements are 0): i32 -> i16
define <8 x i16> @f14() {
; CHECK-LABEL: f14:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call <8 x i16> @llvm.s390.vpksf(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>)
%and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; PACKS (operand elements are 1): i32 -> i16
define <8 x i16> @f15() {
; CHECK-LABEL: f15:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepif %v0, 1
; CHECK-NEXT: vpksf %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call <8 x i16> @llvm.s390.vpksf(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
<4 x i32> <i32 1, i32 1, i32 1, i32 1>)
%and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; PACKS (operand elements are 0): i16 -> i8
define <16 x i8> @f16() {
; CHECK-LABEL: f16:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call <16 x i8> @llvm.s390.vpksh(
<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
%and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %and
}
; PACKS (operand elements are 1): i16 -> i8
define <16 x i8> @f17() {
; CHECK-LABEL: f17:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepih %v0, 1
; CHECK-NEXT: vpksh %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call <16 x i8> @llvm.s390.vpksh(
<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
%and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %and
}
declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>)
; PACKLS (operand elements are 0): i64 -> i32
define <4 x i32> @f18() {
; CHECK-LABEL: f18:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
%and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; PACKLS (operand elements are 1): i64 -> i32
define <4 x i32> @f19() {
; CHECK-LABEL: f19:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepig %v0, 1
; CHECK-NEXT: vpklsg %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> <i64 1, i64 1>, <2 x i64> <i64 1, i64 1>)
%and = and <4 x i32> %call, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; PACKLS (operand elements are 0): i32 -> i16
define <8 x i16> @f20() {
; CHECK-LABEL: f20:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> <i32 0, i32 0, i32 0, i32 0>,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>)
%and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; PACKLS (operand elements are 1): i32 -> i16
define <8 x i16> @f21() {
; CHECK-LABEL: f21:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepif %v0, 1
; CHECK-NEXT: vpklsf %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
<4 x i32> <i32 1, i32 1, i32 1, i32 1>)
%and = and <8 x i16> %call, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; PACKLS (operand elements are 0): i16 -> i8
define <16 x i8> @f22() {
; CHECK-LABEL: f22:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%call = call <16 x i8> @llvm.s390.vpklsh(
<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
%and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %and
}
; PACKLS (operand elements are 1): i16 -> i8
define <16 x i8> @f23() {
; CHECK-LABEL: f23:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepih %v0, 1
; CHECK-NEXT: vpklsh %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call <16 x i8> @llvm.s390.vpklsh(
<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>,
<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
%and = and <16 x i8> %call, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %and
}
declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32)
; VPDI (operand elements are 0):
define <2 x i64> @f24() {
; CHECK-LABEL: f24:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%perm = call <2 x i64> @llvm.s390.vpdi(<2 x i64> <i64 0, i64 0>,
<2 x i64> <i64 0, i64 0>, i32 0)
%res = and <2 x i64> %perm, <i64 1, i64 1>
ret <2 x i64> %res
}
; VPDI (operand elements are 1):
define <2 x i64> @f25() {
; CHECK-LABEL: f25:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepig %v0, 1
; CHECK-NEXT: vpdi %v24, %v0, %v0, 0
; CHECK-NEXT: br %r14
%perm = call <2 x i64> @llvm.s390.vpdi(<2 x i64> <i64 1, i64 1>,
<2 x i64> <i64 1, i64 1>, i32 0)
%res = and <2 x i64> %perm, <i64 1, i64 1>
ret <2 x i64> %res
}
declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32)
; VSLDB (operand elements are 0):
define <16 x i8> @f26() {
; CHECK-LABEL: f26:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%shfd = call <16 x i8> @llvm.s390.vsldb(<16 x i8>
<i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8>
<i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
i32 1)
%res = and <16 x i8> %shfd, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %res
}
; VSLDB (operand elements are 1):
define <16 x i8> @f27() {
; CHECK-LABEL: f27:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vrepib %v0, 1
; CHECK-NEXT: vsldb %v24, %v0, %v0, 1
; CHECK-NEXT: br %r14
%shfd = call <16 x i8> @llvm.s390.vsldb(<16 x i8>
<i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>
<i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>,
i32 1)
%res = and <16 x i8> %shfd, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %res
}
; Test that intrinsic CC result is recognized.
define i32 @f28(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: f28:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: lhi %r2, 0
; CHECK-NEXT: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b)
%cc = extractvalue {<8 x i16>, i32} %call, 1
%res = and i32 %cc, -4
ret i32 %res
}
declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>)
; Test VPERM (operand elements are 0):
define <16 x i8> @f29() {
; CHECK-LABEL: f29:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%perm = call <16 x i8> @llvm.s390.vperm(
<16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
<16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
<16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
%res = and <16 x i8> %perm, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %res
}
; Test VPERM (operand elements are 1):
define <16 x i8> @f30() {
; CHECK-LABEL: f30:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v0, 0
; CHECK-NEXT: vrepib %v1, 1
; CHECK-NEXT: vperm %v24, %v1, %v1, %v0
; CHECK-NEXT: br %r14
%perm = call <16 x i8> @llvm.s390.vperm(
<16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>,
<16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>,
<16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
%res = and <16 x i8> %perm, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %res
}

View File

@ -0,0 +1,384 @@
; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode() with
; vector intrinsics.
;
; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
declare <8 x i16> @llvm.s390.vuphb(<16 x i8>)
declare <8 x i16> @llvm.s390.vuplhb(<16 x i8>)
; VUPHB (used operand elements are 0)
; The high-half unpack only reads the first eight bytes; with those all
; zero, KnownBits lets the AND with 1 fold to an all-zeros vgbm.
define <8 x i16> @f0() {
; CHECK-LABEL: f0:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <8 x i16> @llvm.s390.vuphb(<16 x i8>
<i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
%and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; VUPHB (used operand elements are 1)
; NOTE: The AND is optimized away, but instead of replicating '1' into <8 x
; i16>, the original vector constant is put in the constant pool and then
; unpacked (repeated in more test cases below).
define <8 x i16> @f1() {
; CHECK-LABEL: f1:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vuphb %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <8 x i16> @llvm.s390.vuphb(<16 x i8>
<i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
%and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; VUPLHB (used operand elements are 0)
define <8 x i16> @f2() {
; CHECK-LABEL: f2:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <8 x i16> @llvm.s390.vuplhb(<16 x i8>
<i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
%and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; VUPLHB (used operand elements are 1)
define <8 x i16> @f3() {
; CHECK-LABEL: f3:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vuplhb %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <8 x i16> @llvm.s390.vuplhb(<16 x i8>
<i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
%and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
declare <4 x i32> @llvm.s390.vuphh(<8 x i16>)
declare <4 x i32> @llvm.s390.vuplhh(<8 x i16>)
; VUPHH (used operand elements are 0)
; Halfword variants of the byte tests above: the unpack reads the first
; four i16 elements, so known-zero bits fold the AND away.
define <4 x i32> @f4() {
; CHECK-LABEL: f4:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <4 x i32> @llvm.s390.vuphh(<8 x i16>
<i16 0, i16 0, i16 0, i16 0,
i16 1, i16 1, i16 1, i16 1>)
%and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; VUPHH (used operand elements are 1)
define <4 x i32> @f5() {
; CHECK-LABEL: f5:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vuphh %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <4 x i32> @llvm.s390.vuphh(<8 x i16>
<i16 1, i16 1, i16 1, i16 1,
i16 0, i16 0, i16 0, i16 0>)
%and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; VUPLHH (used operand elements are 0)
define <4 x i32> @f6() {
; CHECK-LABEL: f6:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <4 x i32> @llvm.s390.vuplhh(<8 x i16>
<i16 0, i16 0, i16 0, i16 0,
i16 1, i16 1, i16 1, i16 1>)
%and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; VUPLHH (used operand elements are 1)
define <4 x i32> @f7() {
; CHECK-LABEL: f7:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vuplhh %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <4 x i32> @llvm.s390.vuplhh(<8 x i16>
<i16 1, i16 1, i16 1, i16 1,
i16 0, i16 0, i16 0, i16 0>)
%and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
declare <2 x i64> @llvm.s390.vuphf(<4 x i32>)
declare <2 x i64> @llvm.s390.vuplhf(<4 x i32>)
; VUPHF (used operand elements are 0)
; Word variants: the first two i32 elements feed the unpack.
define <2 x i64> @f8() {
; CHECK-LABEL: f8:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
%and = and <2 x i64> %unp, <i64 1, i64 1>
ret <2 x i64> %and
}
; VUPHF (used operand elements are 1)
define <2 x i64> @f9() {
; CHECK-LABEL: f9:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vuphf %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
%and = and <2 x i64> %unp, <i64 1, i64 1>
ret <2 x i64> %and
}
; VUPLHF (used operand elements are 0)
define <2 x i64> @f10() {
; CHECK-LABEL: f10:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
%and = and <2 x i64> %unp, <i64 1, i64 1>
ret <2 x i64> %and
}
; VUPLHF (used operand elements are 1)
define <2 x i64> @f11() {
; CHECK-LABEL: f11:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vuplhf %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
%and = and <2 x i64> %unp, <i64 1, i64 1>
ret <2 x i64> %and
}
declare <8 x i16> @llvm.s390.vuplb(<16 x i8>)
declare <8 x i16> @llvm.s390.vupllb(<16 x i8>)
; VUPLB (used operand elements are 0)
; The low-half unpacks (VUPL*/VUPLL*) read the second half of the input
; vector, so the constant halves are swapped relative to the VUPH* tests.
define <8 x i16> @f12() {
; CHECK-LABEL: f12:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <8 x i16> @llvm.s390.vuplb(<16 x i8>
<i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
%and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; VUPLB (used operand elements are 1)
define <8 x i16> @f13() {
; CHECK-LABEL: f13:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vuplb %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <8 x i16> @llvm.s390.vuplb(<16 x i8>
<i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
%and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; VUPLLB (used operand elements are 0)
define <8 x i16> @f14() {
; CHECK-LABEL: f14:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <8 x i16> @llvm.s390.vupllb(<16 x i8>
<i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
%and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
; VUPLLB (used operand elements are 1)
define <8 x i16> @f15() {
; CHECK-LABEL: f15:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vupllb %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <8 x i16> @llvm.s390.vupllb(<16 x i8>
<i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
%and = and <8 x i16> %unp, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %and
}
declare <4 x i32> @llvm.s390.vuplhw(<8 x i16>)
declare <4 x i32> @llvm.s390.vupllh(<8 x i16>)
; VUPLHW (used operand elements are 0)
; Low-half halfword unpacks (signed VUPLHW, logical VUPLLH).
define <4 x i32> @f16() {
; CHECK-LABEL: f16:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <4 x i32> @llvm.s390.vuplhw(<8 x i16>
<i16 1, i16 1, i16 1, i16 1,
i16 0, i16 0, i16 0, i16 0>)
%and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; VUPLHW (used operand elements are 1)
define <4 x i32> @f17() {
; CHECK-LABEL: f17:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vuplhw %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <4 x i32> @llvm.s390.vuplhw(<8 x i16>
<i16 0, i16 0, i16 0, i16 0,
i16 1, i16 1, i16 1, i16 1>)
%and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; VUPLLH (used operand elements are 0)
define <4 x i32> @f18() {
; CHECK-LABEL: f18:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <4 x i32> @llvm.s390.vupllh(<8 x i16>
<i16 1, i16 1, i16 1, i16 1,
i16 0, i16 0, i16 0, i16 0>)
%and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
; VUPLLH (used operand elements are 1)
define <4 x i32> @f19() {
; CHECK-LABEL: f19:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vupllh %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <4 x i32> @llvm.s390.vupllh(<8 x i16>
<i16 0, i16 0, i16 0, i16 0,
i16 1, i16 1, i16 1, i16 1>)
%and = and <4 x i32> %unp, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %and
}
declare <2 x i64> @llvm.s390.vuplf(<4 x i32>)
declare <2 x i64> @llvm.s390.vupllf(<4 x i32>)
; VUPLF (used operand elements are 0)
; Low-half word unpacks (signed VUPLF, logical VUPLLF).
define <2 x i64> @f20() {
; CHECK-LABEL: f20:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuplf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
%and = and <2 x i64> %unp, <i64 1, i64 1>
ret <2 x i64> %and
}
; VUPLF (used operand elements are 1)
define <2 x i64> @f21() {
; CHECK-LABEL: f21:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vuplf %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuplf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
%and = and <2 x i64> %unp, <i64 1, i64 1>
ret <2 x i64> %and
}
; VUPLLF (used operand elements are 0)
define <2 x i64> @f22() {
; CHECK-LABEL: f22:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vupllf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
%and = and <2 x i64> %unp, <i64 1, i64 1>
ret <2 x i64> %and
}
; VUPLLF (used operand elements are 1)
define <2 x i64> @f23() {
; CHECK-LABEL: f23:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI
; CHECK-NEXT: vl %v0, 0(%r1)
; CHECK-NEXT: vupllf %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vupllf(<4 x i32> <i32 0, i32 0, i32 1, i32 1>)
%and = and <2 x i64> %unp, <i64 1, i64 1>
ret <2 x i64> %and
}
; The next three tests mask with 0xffffffff00000000 so that only the
; extension (high) half of each unpacked i64 element is observed.
; Test that signed unpacking of positive elements gives known zeros in high part.
define <2 x i64> @f24() {
; CHECK-LABEL: f24:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 1, i32 1, i32 0, i32 0>)
%and = and <2 x i64> %unp, <i64 -4294967296, ; = 0xffffffff00000000
i64 -4294967296>
ret <2 x i64> %and
}
; Test that signed unpacking of negative elements gives known ones in high part.
define <2 x i64> @f25() {
; CHECK-LABEL: f25:
; CHECK-LABEL: # %bb.0:
; 61680 = 0xf0f0
; CHECK-NEXT: vgbm %v24, 61680
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 -1, i32 -1, i32 0, i32 0>)
%and = and <2 x i64> %unp, <i64 -4294967296, ; = 0xffffffff00000000
i64 -4294967296>
ret <2 x i64> %and
}
; Test that logical unpacking of negative elements gives known zeros in high part.
define <2 x i64> @f26() {
; CHECK-LABEL: f26:
; CHECK-LABEL: # %bb.0:
; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> <i32 -1, i32 -1, i32 0, i32 0>)
%and = and <2 x i64> %unp, <i64 -4294967296, ; = 0xffffffff00000000
i64 -4294967296>
ret <2 x i64> %and
}

View File

@ -0,0 +1,51 @@
; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode().
;
; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
; SystemZISD::REPLICATE
; The extracted lane of the zero-extended compare is already 0 or 1, so the
; scalar re-test below must not be re-materialized (hence the -NOT lines).
define i32 @f0() {
; CHECK-LABEL: f0:
; CHECK-LABEL: # %bb.0:
; CHECK: vlgvf
; CHECK-NOT: lhi %r2, 0
; CHECK-NOT: chi %r0, 0
; CHECK-NOT: lochilh %r2, 1
; CHECK: br %r14
%cmp0 = icmp ne <4 x i32> undef, zeroinitializer
%zxt0 = zext <4 x i1> %cmp0 to <4 x i32>
%ext0 = extractelement <4 x i32> %zxt0, i32 3
br label %exit
exit:
; The vector icmp+zext involves a REPLICATE of 1's. If KnownBits reflects
; this, DAGCombiner can see that the i32 icmp and zext here are not needed.
%cmp1 = icmp ne i32 %ext0, 0
%zxt1 = zext i1 %cmp1 to i32
ret i32 %zxt1
}
; SystemZISD::JOIN_DWORDS (and REPLICATE)
define void @f1() {
; The DAG XOR has JOIN_DWORDS and REPLICATE operands. With KnownBits properly set
; for both these nodes, ICMP is used instead of TM during lowering because
; adjustForRedundantAnd() succeeds.
; CHECK-LABEL: f1:
; CHECK-LABEL: # %bb.0:
; CHECK-NOT: tmll
; CHECK-NOT: jne
; CHECK: cijlh
%1 = load i16, i16* null, align 2
%2 = icmp eq i16 %1, 0
; Build a <2 x i1>, invert it, and branch on lane 0 (the negated compare).
%3 = insertelement <2 x i1> undef, i1 %2, i32 0
%4 = insertelement <2 x i1> %3, i1 true, i32 1
%5 = xor <2 x i1> %4, <i1 true, i1 true>
%6 = extractelement <2 x i1> %5, i32 0
%7 = or i1 %6, undef
br i1 %7, label %9, label %8
; <label>:8: ; preds = %0
unreachable
; <label>:9: ; preds = %0
unreachable
}

View File

@ -0,0 +1,236 @@
; Test that DAGCombiner gets helped by ComputeNumSignBitsForTargetNode() with
; vector intrinsics.
;
; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>)
declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>)
declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>)
; PACKS_CC: i64 -> i32
; With these small non-negative inputs the packed elements have enough
; known sign bits that the trunc+sext pair below is a no-op and only the
; pack instruction remains.
define <4 x i32> @f0() {
; CHECK-LABEL: f0:
; CHECK-LABEL: # %bb.0:
; CHECK: vpksgs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 0, i64 1>)
%extr = extractvalue {<4 x i32>, i32} %call, 0
%trunc = trunc <4 x i32> %extr to <4 x i16>
%ret = sext <4 x i16> %trunc to <4 x i32>
ret <4 x i32> %ret
}
; PACKS_CC: i32 -> i16
define <8 x i16> @f1() {
; CHECK-LABEL: f1:
; CHECK-LABEL: # %bb.0:
; CHECK: vpksfs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
<4 x i32> <i32 0, i32 1, i32 1, i32 0>)
%extr = extractvalue {<8 x i16>, i32} %call, 0
%trunc = trunc <8 x i16> %extr to <8 x i8>
%ret = sext <8 x i8> %trunc to <8 x i16>
ret <8 x i16> %ret
}
; PACKS_CC: i16 -> i8
define <16 x i8> @f2() {
; CHECK-LABEL: f2:
; CHECK-LABEL: # %bb.0:
; CHECK: vpkshs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vpkshs(
<8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
<8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
%extr = extractvalue {<16 x i8>, i32} %call, 0
%trunc = trunc <16 x i8> %extr to <16 x i4>
%ret = sext <16 x i4> %trunc to <16 x i8>
ret <16 x i8> %ret
}
declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>)
declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>)
declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>)
; PACKLS_CC: i64 -> i32
; Logical packs with CC result; same trunc+sext elimination as for the
; signed packs above.
define <4 x i32> @f3() {
; CHECK-LABEL: f3:
; CHECK-LABEL: # %bb.0:
; CHECK: vpklsgs %v24, %v1, %v0
; CHECK-NEXT: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 1, i64 0>)
%extr = extractvalue {<4 x i32>, i32} %call, 0
%trunc = trunc <4 x i32> %extr to <4 x i16>
%ret = sext <4 x i16> %trunc to <4 x i32>
ret <4 x i32> %ret
}
; PACKLS_CC: i32 -> i16
define <8 x i16> @f4() {
; CHECK-LABEL: f4:
; CHECK-LABEL: # %bb.0:
; CHECK: vpklsfs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
<4 x i32> <i32 0, i32 1, i32 1, i32 0>)
%extr = extractvalue {<8 x i16>, i32} %call, 0
%trunc = trunc <8 x i16> %extr to <8 x i8>
%ret = sext <8 x i8> %trunc to <8 x i16>
ret <8 x i16> %ret
}
; PACKLS_CC: i16 -> i8
define <16 x i8> @f5() {
; CHECK-LABEL: f5:
; CHECK-LABEL: # %bb.0:
; CHECK: vpklshs %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vpklshs(
<8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
<8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
%extr = extractvalue {<16 x i8>, i32} %call, 0
%trunc = trunc <16 x i8> %extr to <16 x i4>
%ret = sext <16 x i4> %trunc to <16 x i8>
ret <16 x i8> %ret
}
declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>)
; PACKS: i64 -> i32
; Plain signed packs (no CC result).
define <4 x i32> @f6() {
; CHECK-LABEL: f6:
; CHECK-LABEL: # %bb.0:
; CHECK: vpksg %v24, %v1, %v0
; CHECK-NEXT: br %r14
%call = call <4 x i32> @llvm.s390.vpksg(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 1, i64 0>)
%trunc = trunc <4 x i32> %call to <4 x i16>
%ret = sext <4 x i16> %trunc to <4 x i32>
ret <4 x i32> %ret
}
; PACKS: i32 -> i16
define <8 x i16> @f7() {
; CHECK-LABEL: f7:
; CHECK-LABEL: # %bb.0:
; CHECK: vpksf %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call <8 x i16> @llvm.s390.vpksf(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
<4 x i32> <i32 0, i32 1, i32 1, i32 0>)
%trunc = trunc <8 x i16> %call to <8 x i8>
%ret = sext <8 x i8> %trunc to <8 x i16>
ret <8 x i16> %ret
}
; PACKS: i16 -> i8
define <16 x i8> @f8() {
; CHECK-LABEL: f8:
; CHECK-LABEL: # %bb.0:
; CHECK: vpksh %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call <16 x i8> @llvm.s390.vpksh(
<8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
<8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
%trunc = trunc <16 x i8> %call to <16 x i4>
%ret = sext <16 x i4> %trunc to <16 x i8>
ret <16 x i8> %ret
}
declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>)
; PACKLS: i64 -> i32
; Plain logical packs (no CC result).
define <4 x i32> @f9() {
; CHECK-LABEL: f9:
; CHECK-LABEL: # %bb.0:
; CHECK: vpklsg %v24, %v1, %v0
; CHECK-NEXT: br %r14
%call = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> <i64 0, i64 1>, <2 x i64> <i64 1, i64 0>)
%trunc = trunc <4 x i32> %call to <4 x i16>
%ret = sext <4 x i16> %trunc to <4 x i32>
ret <4 x i32> %ret
}
; PACKLS: i32 -> i16
define <8 x i16> @f10() {
; CHECK-LABEL: f10:
; CHECK-LABEL: # %bb.0:
; CHECK: vpklsf %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> <i32 0, i32 1, i32 1, i32 0>,
<4 x i32> <i32 0, i32 1, i32 1, i32 0>)
%trunc = trunc <8 x i16> %call to <8 x i8>
%ret = sext <8 x i8> %trunc to <8 x i16>
ret <8 x i16> %ret
}
; PACKLS: i16 -> i8
define <16 x i8> @f11() {
; CHECK-LABEL: f11:
; CHECK-LABEL: # %bb.0:
; CHECK: vpklsh %v24, %v0, %v0
; CHECK-NEXT: br %r14
%call = call <16 x i8> @llvm.s390.vpklsh(
<8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>,
<8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1>)
%trunc = trunc <16 x i8> %call to <16 x i4>
%ret = sext <16 x i4> %trunc to <16 x i8>
ret <16 x i8> %ret
}
declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32)
; VPDI:
; Doubleword permute of small constants: both result elements keep enough
; sign bits that the trunc+sext pair folds away.
define <2 x i64> @f12() {
; CHECK-LABEL: f12:
; CHECK-LABEL: # %bb.0:
; CHECK: vpdi %v24, %v1, %v0, 0
; CHECK-NEXT: br %r14
%perm = call <2 x i64> @llvm.s390.vpdi(<2 x i64> <i64 0, i64 1>,
<2 x i64> <i64 1, i64 0>, i32 0)
%trunc = trunc <2 x i64> %perm to <2 x i32>
%ret = sext <2 x i32> %trunc to <2 x i64>
ret <2 x i64> %ret
}
declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32)
; VSLDB:
; Shift-left-double by byte: every input byte is 0 or 1, so each result
; byte has at least 4 sign bits and the trunc-to-i4 + sext is a no-op.
define <16 x i8> @f13() {
; CHECK-LABEL: f13:
; CHECK-LABEL: # %bb.0:
; CHECK: vsldb %v24, %v0, %v0, 1
; CHECK-NEXT: br %r14
%shfd = call <16 x i8> @llvm.s390.vsldb(<16 x i8>
<i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>, <16 x i8>
<i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>,
i32 1)
%trunc = trunc <16 x i8> %shfd to <16 x i4>
%ret = sext <16 x i4> %trunc to <16 x i8>
ret <16 x i8> %ret
}
declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>)
; Test VPERM:
; All selectable bytes are 0 or 1, so the permuted result has enough sign
; bits for the trunc-to-i4 + sext to be eliminated.
define <16 x i8> @f14() {
; CHECK-LABEL: f14:
; CHECK-LABEL: # %bb.0:
; CHECK: vperm %v24, %v0, %v0, %v0
; CHECK-NEXT: br %r14
%perm = call <16 x i8> @llvm.s390.vperm(
<16 x i8> <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>,
<16 x i8> <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>,
<16 x i8> <i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1,
i8 0, i8 0, i8 1, i8 1, i8 0, i8 1, i8 1, i8 1>)
%trunc = trunc <16 x i8> %perm to <16 x i4>
%ret = sext <16 x i4> %trunc to <16 x i8>
ret <16 x i8> %ret
}

View File

@ -0,0 +1,97 @@
; Test that DAGCombiner gets helped by ComputeNumSignBitsForTargetNode() with
; vector intrinsics.
;
; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck %s
declare <8 x i16> @llvm.s390.vuphb(<16 x i8>)
; VUPHB
; Each input element is 0 or 1, so the sign-extended unpack result has many
; known sign bits and the trunc+sext pair below folds away.
define <8 x i16> @f0() {
; CHECK-LABEL: f0:
; CHECK-LABEL: # %bb.0:
; CHECK: vuphb %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <8 x i16> @llvm.s390.vuphb(<16 x i8>
<i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1,
i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
%trunc = trunc <8 x i16> %unp to <8 x i8>
%ret = sext <8 x i8> %trunc to <8 x i16>
ret <8 x i16> %ret
}
declare <4 x i32> @llvm.s390.vuphh(<8 x i16>)
; VUPHH
define <4 x i32> @f1() {
; CHECK-LABEL: f1:
; CHECK-LABEL: # %bb.0:
; CHECK: vuphh %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <4 x i32> @llvm.s390.vuphh(<8 x i16>
<i16 0, i16 1, i16 0, i16 1,
i16 0, i16 1, i16 0, i16 1>)
%trunc = trunc <4 x i32> %unp to <4 x i16>
%ret = sext <4 x i16> %trunc to <4 x i32>
ret <4 x i32> %ret
}
declare <2 x i64> @llvm.s390.vuphf(<4 x i32>)
; VUPHF
define <2 x i64> @f2() {
; CHECK-LABEL: f2:
; CHECK-LABEL: # %bb.0:
; CHECK: vuphf %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuphf(<4 x i32> <i32 0, i32 1, i32 0, i32 1>)
%trunc = trunc <2 x i64> %unp to <2 x i32>
%ret = sext <2 x i32> %trunc to <2 x i64>
ret <2 x i64> %ret
}
declare <8 x i16> @llvm.s390.vuplb(<16 x i8>)
; VUPLB
; Low-half unpack variants of the sign-bits tests above.
define <8 x i16> @f3() {
; CHECK-LABEL: f3:
; CHECK-LABEL: # %bb.0:
; CHECK: vuplb %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <8 x i16> @llvm.s390.vuplb(<16 x i8>
<i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1,
i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
%trunc = trunc <8 x i16> %unp to <8 x i8>
%ret = sext <8 x i8> %trunc to <8 x i16>
ret <8 x i16> %ret
}
declare <4 x i32> @llvm.s390.vuplhw(<8 x i16>)
; VUPLHW
define <4 x i32> @f4() {
; CHECK-LABEL: f4:
; CHECK-LABEL: # %bb.0:
; CHECK: vuplhw %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <4 x i32> @llvm.s390.vuplhw(<8 x i16>
<i16 1, i16 0, i16 1, i16 0,
i16 1, i16 0, i16 1, i16 0>)
%trunc = trunc <4 x i32> %unp to <4 x i16>
%ret = sext <4 x i16> %trunc to <4 x i32>
ret <4 x i32> %ret
}
declare <2 x i64> @llvm.s390.vuplf(<4 x i32>)
; VUPLF
define <2 x i64> @f5() {
; CHECK-LABEL: f5:
; CHECK-LABEL: # %bb.0:
; CHECK: vuplf %v24, %v0
; CHECK-NEXT: br %r14
%unp = call <2 x i64> @llvm.s390.vuplf(<4 x i32> <i32 1, i32 0, i32 1, i32 0>)
%trunc = trunc <2 x i64> %unp to <2 x i32>
%ret = sext <2 x i32> %trunc to <2 x i64>
ret <2 x i64> %ret
}

View File

@ -0,0 +1,36 @@
; Test that ComputeNumSignBitsForTargetNode() (SELECT_CCMASK) will help
; DAGCombiner so that it knows that %sel0 is already sign extended.
;
; The patterns below match llc's -debug-only output ("MachineFunction at
; end of ISel"), which is only produced by builds with assertions enabled,
; so restrict the test accordingly.
; REQUIRES: asserts
;
; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -debug-only=isel < %s 2>&1 | FileCheck %s
%0 = type <{ %1*, i16, [6 x i8] }>
%1 = type { i32 (...)** }
define signext i16 @fun(%0* %Arg0, i16 signext %Arg1) {
entry:
br i1 undef, label %lab0, label %lab1
lab0:
%icmp0 = icmp eq i32 undef, 0
; Both select operands (%Arg1 is a signext parameter, 1 is a constant)
; should already be sign extended, so widening %phi0 below should not
; need a separate LHR.
%sel0 = select i1 %icmp0, i16 %Arg1, i16 1
br label %lab1
lab1:
; CHECK: *** MachineFunction at end of ISel ***
; CHECK-LABEL: bb.2.lab1:
; CHECK-NOT: LHR
; CHECK: BRC
%phi0 = phi i16 [ 2, %entry ], [ %sel0, %lab0 ]
%sext0 = sext i16 %phi0 to i32
br i1 undef, label %lab2, label %lab3
lab2:
%and0 = and i32 %sext0, 8
%icmp1 = icmp eq i32 %and0, 0
%sel1 = select i1 %icmp1, i16 %phi0, i16 4
ret i16 %sel1
lab3:
ret i16 8
}

View File

@ -17,8 +17,7 @@ define void @pr32275(<4 x i8> %B15) {
; CHECK-NEXT: vlvgf [[REG2]], [[REG3]], 2
; CHECK-NEXT: vn [[REG2]], [[REG2]], [[REG0]]
; CHECK-NEXT: vlgvf [[REG4:%r[0-9]]], [[REG2]], 3
; CHECK-NEXT: tmll [[REG4]], 1
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: cijlh [[REG4]], 0, .LBB0_1
; CHECK-NEXT: # %bb.2: # %CF36
; CHECK-NEXT: br %r14
BB: