From c6a7e261b5716ed2e845bd3d5276422b3fc093dd Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 12 Sep 2020 11:42:18 -0700 Subject: [PATCH] [SelectionDAG][X86][ARM][AArch64] Add ISD opcode for __builtin_parity. Expand it to shifts and xors. Clang emits (and (ctpop X), 1) for __builtin_parity. If ctpop isn't natively supported by the target, this leads to poor codegen due to the expansion of ctpop being more complex than what is needed for parity. This adds a DAG combine to convert the pattern to ISD::PARITY before operation legalization. Type legalization is updated to handle Expanding and Promoting this operation. If after type legalization, CTPOP is supported for this type, LegalizeDAG will turn it back into CTPOP+AND. Otherwise LegalizeDAG will emit a series of shifts and xors followed by an AND with 1. I've avoided vectors in this patch to avoid additional legalization complexity. X86 previously had a custom DAG combiner for this. This is now moved to Custom lowering for the new opcode. There is a minor regression in vector-reduce-xor-bool.ll, but a follow-up patch can easily fix that. 
Fixes PR47433 Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D87209 --- include/llvm/CodeGen/ISDOpcodes.h | 1 + lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 ++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 26 +++ .../SelectionDAG/LegalizeIntegerTypes.cpp | 21 +- lib/CodeGen/SelectionDAG/LegalizeTypes.h | 3 +- .../SelectionDAG/SelectionDAGDumper.cpp | 1 + lib/CodeGen/TargetLoweringBase.cpp | 1 + lib/Target/X86/X86ISelLowering.cpp | 146 ++++++-------- test/CodeGen/AArch64/parity.ll | 161 +++++++++++++++ test/CodeGen/ARM/parity.ll | 162 +++++++++++++++ test/CodeGen/X86/parity.ll | 189 +++++++++++++++++- test/CodeGen/X86/vector-reduce-xor-bool.ll | 12 +- 12 files changed, 642 insertions(+), 100 deletions(-) create mode 100644 test/CodeGen/AArch64/parity.ll create mode 100644 test/CodeGen/ARM/parity.ll diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index ae08d6e9313..ba5a5d6e875 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -598,6 +598,7 @@ enum NodeType { CTLZ, CTPOP, BITREVERSE, + PARITY, /// Bit counting operators with an undefined result for zero inputs. CTTZ_ZERO_UNDEF, diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index eaa70444578..3aaf5e01d26 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5574,6 +5574,25 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (SDValue V = combineShiftAnd1ToBitTest(N, DAG)) return V; + // fold (and (ctpop X), 1) -> parity X + // Only do this before op legalization as it might be turned back into ctpop. + // TODO: Support vectors? + if (!LegalOperations && isOneConstant(N1) && N0.hasOneUse()) { + SDValue Tmp = N0; + + // It's possible the ctpop has been truncated, but since we only care about + // the LSB we can look through it. 
+ if (Tmp.getOpcode() == ISD::TRUNCATE && Tmp.getOperand(0).hasOneUse()) + Tmp = Tmp.getOperand(0); + + if (Tmp.getOpcode() == ISD::CTPOP) { + SDLoc dl(N); + SDValue Parity = + DAG.getNode(ISD::PARITY, dl, Tmp.getValueType(), Tmp.getOperand(0)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Parity); + } + } + return SDValue(); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 7751ebb7705..71ba228b53f 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -181,6 +181,7 @@ private: SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl); SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl); + SDValue ExpandPARITY(SDValue Op, const SDLoc &dl); SDValue ExpandExtractFromVectorThroughStack(SDValue Op); SDValue ExpandInsertToVectorThroughStack(SDValue Op); @@ -2785,6 +2786,28 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) { } } +/// Open code the operations for PARITY of the specified operation. +SDValue SelectionDAGLegalize::ExpandPARITY(SDValue Op, const SDLoc &dl) { + EVT VT = Op.getValueType(); + EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + unsigned Sz = VT.getScalarSizeInBits(); + + // If CTPOP is legal, use it. Otherwise use shifts and xor. 
+ SDValue Result; + if (TLI.isOperationLegal(ISD::CTPOP, VT)) { + Result = DAG.getNode(ISD::CTPOP, dl, VT, Op); + } else { + Result = Op; + for (unsigned i = Log2_32_Ceil(Sz); i != 0;) { + SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, Result, + DAG.getConstant(1 << (--i), dl, ShVT)); + Result = DAG.getNode(ISD::XOR, dl, VT, Result, Shift); + } + } + + return DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(1, dl, VT)); +} + bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { LLVM_DEBUG(dbgs() << "Trying to expand node\n"); SmallVector<SDValue, 8> Results; @@ -2816,6 +2839,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { case ISD::BSWAP: Results.push_back(ExpandBSWAP(Node->getOperand(0), dl)); break; + case ISD::PARITY: + Results.push_back(ExpandPARITY(Node->getOperand(0), dl)); + break; case ISD::FRAMEADDR: case ISD::RETURNADDR: case ISD::FRAME_TO_ARGS_OFFSET: diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index bfe1b365efc..0000fcb1dde 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -62,7 +62,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::Constant: Res = PromoteIntRes_Constant(N); break; case ISD::CTLZ_ZERO_UNDEF: case ISD::CTLZ: Res = PromoteIntRes_CTLZ(N); break; - case ISD::CTPOP: Res = PromoteIntRes_CTPOP(N); break; + case ISD::PARITY: + case ISD::CTPOP: Res = PromoteIntRes_CTPOP_PARITY(N); break; case ISD::CTTZ_ZERO_UNDEF: case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break; case ISD::EXTRACT_VECTOR_ELT: @@ -503,10 +504,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { NVT)); } -SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) { - // Zero extend to the promoted type and do the count there. +SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) { + // Zero extend to the promoted type and do the count or parity there. 
SDValue Op = ZExtPromotedInteger(N->getOperand(0)); - return DAG.getNode(ISD::CTPOP, SDLoc(N), Op.getValueType(), Op); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op); } SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { @@ -1980,6 +1981,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::AssertZext: ExpandIntRes_AssertZext(N, Lo, Hi); break; case ISD::BITREVERSE: ExpandIntRes_BITREVERSE(N, Lo, Hi); break; case ISD::BSWAP: ExpandIntRes_BSWAP(N, Lo, Hi); break; + case ISD::PARITY: ExpandIntRes_PARITY(N, Lo, Hi); break; case ISD::Constant: ExpandIntRes_Constant(N, Lo, Hi); break; case ISD::ABS: ExpandIntRes_ABS(N, Lo, Hi); break; case ISD::CTLZ_ZERO_UNDEF: @@ -2772,6 +2774,17 @@ void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N, Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi); } +void DAGTypeLegalizer::ExpandIntRes_PARITY(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc dl(N); + // parity(HiLo) -> parity(Lo^Hi) + GetExpandedInteger(N->getOperand(0), Lo, Hi); + EVT NVT = Lo.getValueType(); + Lo = + DAG.getNode(ISD::PARITY, dl, NVT, DAG.getNode(ISD::XOR, dl, NVT, Lo, Hi)); + Hi = DAG.getConstant(0, dl, NVT); +} + void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 34c56367275..86f4fcc023d 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -311,7 +311,7 @@ private: SDValue PromoteIntRes_BUILD_PAIR(SDNode *N); SDValue PromoteIntRes_Constant(SDNode *N); SDValue PromoteIntRes_CTLZ(SDNode *N); - SDValue PromoteIntRes_CTPOP(SDNode *N); + SDValue PromoteIntRes_CTPOP_PARITY(SDNode *N); SDValue PromoteIntRes_CTTZ(SDNode *N); SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntRes_FP_TO_XINT(SDNode *N); @@ -431,6 +431,7 @@ 
private: void ExpandIntRes_ADDSUBCARRY (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BITREVERSE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_PARITY (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SDIV (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SREM (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index fcd09b61416..f854a4f4d35 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -412,6 +412,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef"; case ISD::CTLZ: return "ctlz"; case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef"; + case ISD::PARITY: return "parity"; // Trampolines case ISD::INIT_TRAMPOLINE: return "init_trampoline"; diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 958bb793904..7ef37db68a2 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -692,6 +692,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::BITREVERSE, VT, Expand); + setOperationAction(ISD::PARITY, VT, Expand); // These library functions default to expand. 
setOperationAction(ISD::FROUND, VT, Expand); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 8913dff47df..5f7721267db 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -385,6 +385,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); + setOperationAction(ISD::PARITY, MVT::i8, Custom); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); } else { @@ -395,6 +396,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP , MVT::i64 , Expand); else setOperationAction(ISD::CTPOP , MVT::i64 , Custom); + + setOperationAction(ISD::PARITY, MVT::i16, Custom); + setOperationAction(ISD::PARITY, MVT::i32, Custom); + if (Subtarget.is64Bit()) + setOperationAction(ISD::PARITY, MVT::i64, Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); @@ -28865,6 +28871,58 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); } +static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue X = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + // Special case. If the input fits in 8-bits we can use a single 8-bit TEST. + if (VT == MVT::i8 || + DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) { + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); + SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X, + DAG.getConstant(0, DL, MVT::i8)); + // Copy the inverse of the parity flag into a register with setcc. + SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); + // Extend to the original type. + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp); + } + + if (VT == MVT::i64) { + // Xor the high and low 32-bits together using a 32-bit operation. 
+ SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, + DAG.getNode(ISD::SRL, DL, MVT::i64, X, + DAG.getConstant(32, DL, MVT::i8))); + SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi); + } + + if (VT != MVT::i16) { + // Xor the high and low 16-bits together using a 32-bit operation. + SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X, + DAG.getConstant(16, DL, MVT::i8)); + X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16); + } else { + // If the input is 16-bits, we need to extend to use an i32 shift below. + X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X); + } + + // Finally xor the low 2 bytes together and use a 8-bit flag setting xor. + // This should allow an h-reg to be used to save a shift. + SDValue Hi = DAG.getNode( + ISD::TRUNCATE, DL, MVT::i8, + DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8))); + SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); + SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32); + SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1); + + // Copy the inverse of the parity flag into a register with setcc. + SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); + // Extend to the original type. 
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp); +} + static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NewOpc = 0; @@ -29483,6 +29541,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); + case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG); @@ -43285,89 +43344,6 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, return SDValue(); } -// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity. -// Turn it into series of XORs and a setnp. -static SDValue combineParity(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - // RHS needs to be 1. - if (!isOneConstant(N1)) - return SDValue(); - - // Popcnt may be truncated. - if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) - N0 = N0.getOperand(0); - - // LHS needs to be a single use CTPOP. - if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse()) - return SDValue(); - - EVT VT = N0.getValueType(); - - // We only support 64-bit and 32-bit. 64-bit requires special handling - // unless the 64-bit popcnt instruction is legal. - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT)) - return SDValue(); - - SDLoc DL(N); - SDValue X = N0.getOperand(0); - - // Special case. If the input fits in 8-bits we can use a single 8-bit TEST. 
- if (DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) { - X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); - SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X, - DAG.getConstant(0, DL, MVT::i8)); - // Copy the inverse of the parity flag into a register with setcc. - SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); - // Extend or truncate to the original type. - return DAG.getZExtOrTrunc(Setnp, DL, N->getValueType(0)); - } - - // If this is 64-bit, its always best to xor the two 32-bit pieces together - // even if we have popcnt. - if (VT == MVT::i64) { - SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, - DAG.getNode(ISD::SRL, DL, VT, X, - DAG.getConstant(32, DL, MVT::i8))); - SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi); - // Generate a 32-bit parity idiom. This will bring us back here if we need - // to expand it too. - SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32, - DAG.getNode(ISD::CTPOP, DL, MVT::i32, X), - DAG.getConstant(1, DL, MVT::i32)); - return DAG.getZExtOrTrunc(Parity, DL, N->getValueType(0)); - } - assert(VT == MVT::i32 && "Unexpected VT!"); - - // Xor the high and low 16-bits together using a 32-bit operation. - SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X, - DAG.getConstant(16, DL, MVT::i8)); - X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16); - - // Finally xor the low 2 bytes together and use a 8-bit flag setting xor. - // This should allow an h-reg to be used to save a shift. - // FIXME: We only get an h-reg in 32-bit mode. - SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, - DAG.getNode(ISD::SRL, DL, VT, X, - DAG.getConstant(8, DL, MVT::i8))); - SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); - SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32); - SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1); - - // Copy the inverse of the parity flag into a register with setcc. 
- SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); - // Extend or truncate to the original type. - return DAG.getZExtOrTrunc(Setnp, DL, N->getValueType(0)); -} - - // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C) // Where C is a mask containing the same number of bits as the setcc and // where the setcc will freely 0 upper bits of k-register. We can replace the @@ -43459,10 +43435,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, } } - // This must be done before legalization has expanded the ctpop. - if (SDValue V = combineParity(N, DAG, Subtarget)) - return V; - // Match all-of bool scalar reductions into a bitcast/movmsk + cmp. // TODO: Support multiple SrcOps. if (VT == MVT::i1) { diff --git a/test/CodeGen/AArch64/parity.ll b/test/CodeGen/AArch64/parity.ll new file mode 100644 index 00000000000..bdddb6f1069 --- /dev/null +++ b/test/CodeGen/AArch64/parity.ll @@ -0,0 +1,161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s + +define i4 @parity_4(i4 %x) { +; CHECK-LABEL: parity_4: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xf +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i4 @llvm.ctpop.i4(i4 %x) + %2 = and i4 %1, 1 + ret i4 %2 +} + +define i8 @parity_8(i8 %x) { +; CHECK-LABEL: parity_8: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i8 @llvm.ctpop.i8(i8 %x) + %2 = and i8 %1, 1 + ret i8 %2 +} + +define i16 @parity_16(i16 %x) { +; CHECK-LABEL: parity_16: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: eor w8, w8, w8, lsr #8 +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 
+; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i16 @llvm.ctpop.i16(i16 %x) + %2 = and i16 %1, 1 + ret i16 %2 +} + +define i17 @parity_17(i17 %x) { +; CHECK-LABEL: parity_17: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0x1ffff +; CHECK-NEXT: eor w8, w8, w8, lsr #16 +; CHECK-NEXT: eor w8, w8, w8, lsr #8 +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i17 @llvm.ctpop.i17(i17 %x) + %2 = and i17 %1, 1 + ret i17 %2 +} + +define i32 @parity_32(i32 %x) { +; CHECK-LABEL: parity_32: +; CHECK: // %bb.0: +; CHECK-NEXT: eor w8, w0, w0, lsr #16 +; CHECK-NEXT: eor w8, w8, w8, lsr #8 +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i32 @llvm.ctpop.i32(i32 %x) + %2 = and i32 %1, 1 + ret i32 %2 +} + +define i64 @parity_64(i64 %x) { +; CHECK-LABEL: parity_64: +; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x0, x0, lsr #32 +; CHECK-NEXT: eor x8, x8, x8, lsr #16 +; CHECK-NEXT: eor x8, x8, x8, lsr #8 +; CHECK-NEXT: eor x8, x8, x8, lsr #4 +; CHECK-NEXT: eor x8, x8, x8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and x0, x8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.ctpop.i64(i64 %x) + %2 = and i64 %1, 1 + ret i64 %2 +} + +define i32 @parity_64_trunc(i64 %x) { +; CHECK-LABEL: parity_64_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x0, x0, lsr #32 +; CHECK-NEXT: eor x8, x8, x8, lsr #16 +; CHECK-NEXT: eor x8, x8, x8, lsr #8 +; CHECK-NEXT: eor x8, x8, x8, lsr #4 +; CHECK-NEXT: eor x8, x8, x8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.ctpop.i64(i64 %x) + %2 = trunc i64 %1 to i32 + %3 = and i32 %2, 1 + ret i32 %3 +} + +define i8 @parity_32_trunc(i32 %x) { +; 
CHECK-LABEL: parity_32_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: eor w8, w0, w0, lsr #16 +; CHECK-NEXT: eor w8, w8, w8, lsr #8 +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i32 @llvm.ctpop.i32(i32 %x) + %2 = trunc i32 %1 to i8 + %3 = and i8 %2, 1 + ret i8 %3 +} + +define i32 @parity_8_zext(i8 %x) { +; CHECK-LABEL: parity_8_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %a = zext i8 %x to i32 + %b = tail call i32 @llvm.ctpop.i32(i32 %a) + %c = and i32 %b, 1 + ret i32 %c +} + +define i32 @parity_8_mask(i32 %x) { +; CHECK-LABEL: parity_8_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %a = and i32 %x, 255 + %b = tail call i32 @llvm.ctpop.i32(i32 %a) + %c = and i32 %b, 1 + ret i32 %c +} + +declare i4 @llvm.ctpop.i4(i4 %x) +declare i8 @llvm.ctpop.i8(i8 %x) +declare i16 @llvm.ctpop.i16(i16 %x) +declare i17 @llvm.ctpop.i17(i17 %x) +declare i32 @llvm.ctpop.i32(i32 %x) +declare i64 @llvm.ctpop.i64(i64 %x) diff --git a/test/CodeGen/ARM/parity.ll b/test/CodeGen/ARM/parity.ll new file mode 100644 index 00000000000..40c0d7bd32f --- /dev/null +++ b/test/CodeGen/ARM/parity.ll @@ -0,0 +1,162 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple arm-eabi -mattr=+v6t2 | FileCheck %s + +define i4 @parity_4(i4 %x) { +; CHECK-LABEL: parity_4: +; CHECK: @ %bb.0: +; CHECK-NEXT: and r0, r0, #15 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i4 
@llvm.ctpop.i4(i4 %x) + %2 = and i4 %1, 1 + ret i4 %2 +} + +define i8 @parity_8(i8 %x) { +; CHECK-LABEL: parity_8: +; CHECK: @ %bb.0: +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i8 @llvm.ctpop.i8(i8 %x) + %2 = and i8 %1, 1 + ret i8 %2 +} + +define i16 @parity_16(i16 %x) { +; CHECK-LABEL: parity_16: +; CHECK: @ %bb.0: +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i16 @llvm.ctpop.i16(i16 %x) + %2 = and i16 %1, 1 + ret i16 %2 +} + +define i17 @parity_17(i17 %x) { +; CHECK-LABEL: parity_17: +; CHECK: @ %bb.0: +; CHECK-NEXT: bfc r0, #17, #15 +; CHECK-NEXT: eor r0, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i17 @llvm.ctpop.i17(i17 %x) + %2 = and i17 %1, 1 + ret i17 %2 +} + +define i32 @parity_32(i32 %x) { +; CHECK-LABEL: parity_32: +; CHECK: @ %bb.0: +; CHECK-NEXT: eor r0, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i32 @llvm.ctpop.i32(i32 %x) + %2 = and i32 %1, 1 + ret i32 %2 +} + +define i64 @parity_64(i64 %x) { +; CHECK-LABEL: parity_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: eor r0, r0, r1 +; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: eor r0, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; 
CHECK-NEXT: bx lr + %1 = tail call i64 @llvm.ctpop.i64(i64 %x) + %2 = and i64 %1, 1 + ret i64 %2 +} + +define i32 @parity_64_trunc(i64 %x) { +; CHECK-LABEL: parity_64_trunc: +; CHECK: @ %bb.0: +; CHECK-NEXT: eor r0, r0, r1 +; CHECK-NEXT: eor r0, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i64 @llvm.ctpop.i64(i64 %x) + %2 = trunc i64 %1 to i32 + %3 = and i32 %2, 1 + ret i32 %3 +} + +define i8 @parity_32_trunc(i32 %x) { +; CHECK-LABEL: parity_32_trunc: +; CHECK: @ %bb.0: +; CHECK-NEXT: eor r0, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i32 @llvm.ctpop.i32(i32 %x) + %2 = trunc i32 %1 to i8 + %3 = and i8 %2, 1 + ret i8 %3 +} + +define i32 @parity_8_zext(i8 %x) { +; CHECK-LABEL: parity_8_zext: +; CHECK: @ %bb.0: +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %a = zext i8 %x to i32 + %b = tail call i32 @llvm.ctpop.i32(i32 %a) + %c = and i32 %b, 1 + ret i32 %c +} + +define i32 @parity_8_mask(i32 %x) { +; CHECK-LABEL: parity_8_mask: +; CHECK: @ %bb.0: +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %a = and i32 %x, 255 + %b = tail call i32 @llvm.ctpop.i32(i32 %a) + %c = and i32 %b, 1 + ret i32 %c +} + +declare i4 @llvm.ctpop.i4(i4 %x) +declare i8 @llvm.ctpop.i8(i8 %x) +declare i16 @llvm.ctpop.i16(i16 %x) +declare i17 @llvm.ctpop.i17(i17 %x) +declare i32 @llvm.ctpop.i32(i32 %x) +declare i64 
@llvm.ctpop.i64(i64 %x) diff --git a/test/CodeGen/X86/parity.ll b/test/CodeGen/X86/parity.ll index 6289ab48242..d7344a4a2ed 100644 --- a/test/CodeGen/X86/parity.ll +++ b/test/CodeGen/X86/parity.ll @@ -4,6 +4,187 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X86-POPCNT ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT +define i4 @parity_4(i4 %x) { +; X86-NOPOPCNT-LABEL: parity_4: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: testb $15, {{[0-9]+}}(%esp) +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_4: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: testb $15, %dil +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_4: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: testb $15, {{[0-9]+}}(%esp) +; X86-POPCNT-NEXT: setnp %al +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_4: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: testb $15, %dil +; X64-POPCNT-NEXT: setnp %al +; X64-POPCNT-NEXT: retq + %1 = tail call i4 @llvm.ctpop.i4(i4 %x) + %2 = and i4 %1, 1 + ret i4 %2 +} + +define i8 @parity_8(i8 %x) { +; X86-NOPOPCNT-LABEL: parity_8: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: cmpb $0, {{[0-9]+}}(%esp) +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_8: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: testb %dil, %dil +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_8: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: cmpb $0, {{[0-9]+}}(%esp) +; X86-POPCNT-NEXT: setnp %al +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_8: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: testb %dil, %dil +; X64-POPCNT-NEXT: setnp %al +; X64-POPCNT-NEXT: retq + %1 = tail call i8 @llvm.ctpop.i8(i8 %x) + %2 = and i8 %1, 1 + ret i8 %2 +} + +define i16 @parity_16(i16 %x) { +; X86-NOPOPCNT-LABEL: parity_16: +; X86-NOPOPCNT: # 
%bb.0: +; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %eax +; X86-NOPOPCNT-NEXT: xorb %ch, %cl +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_16: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: movl %edi, %ecx +; X64-NOPOPCNT-NEXT: xorl %eax, %eax +; X64-NOPOPCNT-NEXT: xorb %ch, %cl +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_16: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntw {{[0-9]+}}(%esp), %ax +; X86-POPCNT-NEXT: andl $1, %eax +; X86-POPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_16: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntw %di, %ax +; X64-POPCNT-NEXT: andl $1, %eax +; X64-POPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X64-POPCNT-NEXT: retq + %1 = tail call i16 @llvm.ctpop.i16(i16 %x) + %2 = and i16 %1, 1 + ret i16 %2 +} + +define i16 @parity_16_load(i16* %x) { +; X86-NOPOPCNT-LABEL: parity_16_load: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOPOPCNT-NEXT: movzwl (%eax), %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %eax +; X86-NOPOPCNT-NEXT: xorb %ch, %cl +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_16_load: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: movzwl (%rdi), %ecx +; X64-NOPOPCNT-NEXT: xorl %eax, %eax +; X64-NOPOPCNT-NEXT: xorb %ch, %cl +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_16_load: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntw (%eax), %ax +; X86-POPCNT-NEXT: andl $1, %eax +; X86-POPCNT-NEXT: # kill: def $ax killed $ax 
killed $eax +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_16_load: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntw (%rdi), %ax +; X64-POPCNT-NEXT: andl $1, %eax +; X64-POPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X64-POPCNT-NEXT: retq + %1 = load i16, i16* %x + %2 = tail call i16 @llvm.ctpop.i16(i16 %1) + %3 = and i16 %2, 1 + ret i16 %3 +} + +define i17 @parity_17(i17 %x) { +; X86-NOPOPCNT-LABEL: parity_17: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOPOPCNT-NEXT: movl %ecx, %eax +; X86-NOPOPCNT-NEXT: andl $131071, %eax # imm = 0x1FFFF +; X86-NOPOPCNT-NEXT: movl %eax, %edx +; X86-NOPOPCNT-NEXT: shrl $16, %edx +; X86-NOPOPCNT-NEXT: xorl %eax, %edx +; X86-NOPOPCNT-NEXT: xorl %eax, %eax +; X86-NOPOPCNT-NEXT: xorb %dl, %ch +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_17: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: andl $131071, %eax # imm = 0x1FFFF +; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: shrl $16, %ecx +; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: shrl $8, %edi +; X64-NOPOPCNT-NEXT: xorl %eax, %eax +; X64-NOPOPCNT-NEXT: xorb %cl, %dil +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_17: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: movl $131071, %eax # imm = 0x1FFFF +; X86-POPCNT-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntl %eax, %eax +; X86-POPCNT-NEXT: andl $1, %eax +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_17: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: andl $131071, %edi # imm = 0x1FFFF +; X64-POPCNT-NEXT: popcntl %edi, %eax +; X64-POPCNT-NEXT: andl $1, %eax +; X64-POPCNT-NEXT: retq + %1 = tail call i17 @llvm.ctpop.i17(i17 %x) + %2 = and i17 %1, 1 + ret i17 %2 +} + define i32 @parity_32(i32 %x) { ; X86-NOPOPCNT-LABEL: parity_32: ; X86-NOPOPCNT: # %bb.0: @@ -157,14 +338,14 @@ define i8 @parity_32_trunc(i32 %x) { ; 
X86-POPCNT-LABEL: parity_32_trunc: ; X86-POPCNT: # %bb.0: ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: andb $1, %al +; X86-POPCNT-NEXT: andl $1, %eax ; X86-POPCNT-NEXT: # kill: def $al killed $al killed $eax ; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: parity_32_trunc: ; X64-POPCNT: # %bb.0: ; X64-POPCNT-NEXT: popcntl %edi, %eax -; X64-POPCNT-NEXT: andb $1, %al +; X64-POPCNT-NEXT: andl $1, %eax ; X64-POPCNT-NEXT: # kill: def $al killed $al killed $eax ; X64-POPCNT-NEXT: retq %1 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -241,5 +422,9 @@ define i32 @parity_8_mask(i32 %x) { ret i32 %c } +declare i4 @llvm.ctpop.i4(i4 %x) +declare i8 @llvm.ctpop.i8(i8 %x) +declare i16 @llvm.ctpop.i16(i16 %x) +declare i17 @llvm.ctpop.i17(i17 %x) declare i32 @llvm.ctpop.i32(i32 %x) declare i64 @llvm.ctpop.i64(i64 %x) diff --git a/test/CodeGen/X86/vector-reduce-xor-bool.ll b/test/CodeGen/X86/vector-reduce-xor-bool.ll index fb019ffd99e..06a428c514a 100644 --- a/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -53,7 +53,7 @@ define i1 @trunc_v2i64_v2i1(<2 x i64>) { ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb %al, %al +; AVX512VL-NEXT: testb $3, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> @@ -103,7 +103,7 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) { ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb %al, %al +; AVX512VL-NEXT: testb $15, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> @@ -251,7 +251,7 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) { ; AVX512VL-NEXT: vpsllq $63, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb %al, %al +; AVX512VL-NEXT: testb $15, %al ; AVX512VL-NEXT: setnp 
%al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -974,7 +974,7 @@ define i1 @icmp_v2i64_v2i1(<2 x i64>) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb %al, %al +; AVX512VL-NEXT: testb $3, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, zeroinitializer @@ -1025,7 +1025,7 @@ define i1 @icmp_v4i32_v4i1(<4 x i32>) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb %al, %al +; AVX512VL-NEXT: testb $15, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, zeroinitializer @@ -1214,7 +1214,7 @@ define i1 @icmp_v4i64_v4i1(<4 x i64>) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb %al, %al +; AVX512VL-NEXT: testb $15, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq