[SelectionDAG][X86][ARM][AArch64] Add ISD opcode for __builtin_parity. Expand it to shifts and xors.

Clang emits (and (ctpop X), 1) for __builtin_parity. If ctpop isn't natively supported by the target, this leads to poor codegen due to the expansion of ctpop being more complex than what is needed for parity. This adds a DAG combine to convert the pattern to ISD::PARITY before operation legalization. Type legalization is updated to handle Expanding and Promoting this operation. If, after type legalization, CTPOP is supported for this type, LegalizeDAG will turn it back into CTPOP+AND. Otherwise LegalizeDAG will emit a series of shifts and xors followed by an AND with 1. I've avoided vectors in this patch to avoid more legalization complexity. X86 previously had a custom DAG combiner for this. This is now moved to Custom lowering for the new opcode. There is a minor regression in vector-reduce-xor-bool.ll, but a follow-up patch can easily fix that. Fixes PR47433 Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D87209
2025-01-31 12:41:49 +01:00 · 2020-09-12 11:42:18 -07:00 · 2020-09-12 11:42:18 -07:00 · c6a7e261b5
commit c6a7e261b5
parent 99ce5bbc34
12 changed files with 642 additions and 100 deletions
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@ -598,6 +598,7 @@ enum NodeType {
  CTLZ,
  CTPOP,
  BITREVERSE,
+  PARITY,

  /// Bit counting operators with an undefined result for zero inputs.
  CTTZ_ZERO_UNDEF,
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@ -5574,6 +5574,25 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
    if (SDValue V = combineShiftAnd1ToBitTest(N, DAG))
      return V;

+  // fold (and (ctpop X), 1) -> parity X
+  // Only do this before op legalization as it might be turned back into ctpop.
+  // TODO: Support vectors?
+  if (!LegalOperations && isOneConstant(N1) && N0.hasOneUse()) {
+    SDValue Tmp = N0;
+
+    // It's possible the ctpop has been truncated, but since we only care about
+    // the LSB we can look through it.
+    if (Tmp.getOpcode() == ISD::TRUNCATE && Tmp.getOperand(0).hasOneUse())
+      Tmp = Tmp.getOperand(0);
+
+    if (Tmp.getOpcode() == ISD::CTPOP) {
+      SDLoc dl(N);
+      SDValue Parity =
+          DAG.getNode(ISD::PARITY, dl, Tmp.getValueType(), Tmp.getOperand(0));
+      return DAG.getNode(ISD::TRUNCATE, dl, VT, Parity);
+    }
+  }
+
  return SDValue();
 }

--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@ -181,6 +181,7 @@ private:

  SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl);
  SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl);
+  SDValue ExpandPARITY(SDValue Op, const SDLoc &dl);

  SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
  SDValue ExpandInsertToVectorThroughStack(SDValue Op);
@ -2785,6 +2786,28 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
  }
 }

+/// Open code the operations for PARITY of the specified operand.
+SDValue SelectionDAGLegalize::ExpandPARITY(SDValue Op, const SDLoc &dl) {
+  EVT VT = Op.getValueType();
+  EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+  unsigned Sz = VT.getScalarSizeInBits();
+
+  // If CTPOP is legal, use it. Otherwise use shifts and xor.
+  SDValue Result;
+  if (TLI.isOperationLegal(ISD::CTPOP, VT)) {
+    Result = DAG.getNode(ISD::CTPOP, dl, VT, Op);
+  } else {
+    Result = Op;
+    for (unsigned i = Log2_32_Ceil(Sz); i != 0;) {
+      SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, Result,
+                                  DAG.getConstant(1 << (--i), dl, ShVT));
+      Result = DAG.getNode(ISD::XOR, dl, VT, Result, Shift);
+    }
+  }
+
+  return DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(1, dl, VT));
+}
+
 bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
  LLVM_DEBUG(dbgs() << "Trying to expand node\n");
  SmallVector<SDValue, 8> Results;
@ -2816,6 +2839,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
  case ISD::BSWAP:
    Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
    break;
+  case ISD::PARITY:
+    Results.push_back(ExpandPARITY(Node->getOperand(0), dl));
+    break;
  case ISD::FRAMEADDR:
  case ISD::RETURNADDR:
  case ISD::FRAME_TO_ARGS_OFFSET:
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@ -62,7 +62,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
  case ISD::Constant:    Res = PromoteIntRes_Constant(N); break;
  case ISD::CTLZ_ZERO_UNDEF:
  case ISD::CTLZ:        Res = PromoteIntRes_CTLZ(N); break;
-  case ISD::CTPOP:       Res = PromoteIntRes_CTPOP(N); break;
+  case ISD::PARITY:
+  case ISD::CTPOP:       Res = PromoteIntRes_CTPOP_PARITY(N); break;
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTTZ:        Res = PromoteIntRes_CTTZ(N); break;
  case ISD::EXTRACT_VECTOR_ELT:
@ -503,10 +504,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
                      NVT));
 }

-SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) {
-  // Zero extend to the promoted type and do the count there.
+SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) {
+  // Zero extend to the promoted type and do the count or parity there.
  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
-  return DAG.getNode(ISD::CTPOP, SDLoc(N), Op.getValueType(), Op);
+  return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op);
 }

 SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
@ -1980,6 +1981,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
  case ISD::AssertZext:  ExpandIntRes_AssertZext(N, Lo, Hi); break;
  case ISD::BITREVERSE:  ExpandIntRes_BITREVERSE(N, Lo, Hi); break;
  case ISD::BSWAP:       ExpandIntRes_BSWAP(N, Lo, Hi); break;
+  case ISD::PARITY:      ExpandIntRes_PARITY(N, Lo, Hi); break;
  case ISD::Constant:    ExpandIntRes_Constant(N, Lo, Hi); break;
  case ISD::ABS:         ExpandIntRes_ABS(N, Lo, Hi); break;
  case ISD::CTLZ_ZERO_UNDEF:
@ -2772,6 +2774,17 @@ void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N,
  Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi);
 }

+void DAGTypeLegalizer::ExpandIntRes_PARITY(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDLoc dl(N);
+  // parity(HiLo) -> parity(Lo^Hi)
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+  Lo =
+      DAG.getNode(ISD::PARITY, dl, NVT, DAG.getNode(ISD::XOR, dl, NVT, Lo, Hi));
+  Hi = DAG.getConstant(0, dl, NVT);
+}
+
 void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N,
                                             SDValue &Lo, SDValue &Hi) {
  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@ -311,7 +311,7 @@ private:
  SDValue PromoteIntRes_BUILD_PAIR(SDNode *N);
  SDValue PromoteIntRes_Constant(SDNode *N);
  SDValue PromoteIntRes_CTLZ(SDNode *N);
-  SDValue PromoteIntRes_CTPOP(SDNode *N);
+  SDValue PromoteIntRes_CTPOP_PARITY(SDNode *N);
  SDValue PromoteIntRes_CTTZ(SDNode *N);
  SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
  SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
@ -431,6 +431,7 @@ private:
  void ExpandIntRes_ADDSUBCARRY       (SDNode *N, SDValue &Lo, SDValue &Hi);
  void ExpandIntRes_BITREVERSE        (SDNode *N, SDValue &Lo, SDValue &Hi);
  void ExpandIntRes_BSWAP             (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_PARITY            (SDNode *N, SDValue &Lo, SDValue &Hi);
  void ExpandIntRes_MUL               (SDNode *N, SDValue &Lo, SDValue &Hi);
  void ExpandIntRes_SDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
  void ExpandIntRes_SREM              (SDNode *N, SDValue &Lo, SDValue &Hi);
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@ -412,6 +412,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
  case ISD::CTTZ_ZERO_UNDEF:            return "cttz_zero_undef";
  case ISD::CTLZ:                       return "ctlz";
  case ISD::CTLZ_ZERO_UNDEF:            return "ctlz_zero_undef";
+  case ISD::PARITY:                     return "parity";

  // Trampolines
  case ISD::INIT_TRAMPOLINE:            return "init_trampoline";
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@ -692,6 +692,7 @@ void TargetLoweringBase::initActions() {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);

    setOperationAction(ISD::BITREVERSE, VT, Expand);
+    setOperationAction(ISD::PARITY, VT, Expand);

    // These library functions default to expand.
    setOperationAction(ISD::FROUND, VT, Expand);
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -385,6 +385,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

+  setOperationAction(ISD::PARITY, MVT::i8, Custom);
  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
@ -395,6 +396,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    else
      setOperationAction(ISD::CTPOP        , MVT::i64  , Custom);
+
+    setOperationAction(ISD::PARITY, MVT::i16, Custom);
+    setOperationAction(ISD::PARITY, MVT::i32, Custom);
+    if (Subtarget.is64Bit())
+      setOperationAction(ISD::PARITY, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
@ -28865,6 +28871,58 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
 }

+static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue X = Op.getOperand(0);
+  MVT VT = Op.getSimpleValueType();
+
+  // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
+  if (VT == MVT::i8 ||
+      DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
+    X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+    SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
+                                DAG.getConstant(0, DL, MVT::i8));
+    // Copy the inverse of the parity flag into a register with setcc.
+    SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+    // Extend to the original type.
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+  }
+
+  if (VT == MVT::i64) {
+    // Xor the high and low 32-bit halves together using a 32-bit operation.
+    SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
+                             DAG.getNode(ISD::SRL, DL, MVT::i64, X,
+                                         DAG.getConstant(32, DL, MVT::i8)));
+    SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
+  }
+
+  if (VT != MVT::i16) {
+    // Xor the high and low 16-bits together using a 32-bit operation.
+    SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
+                               DAG.getConstant(16, DL, MVT::i8));
+    X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
+  } else {
+    // If the input is 16-bits, we need to extend to use an i32 shift below.
+    X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
+  }
+
+  // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
+  // This should allow an h-reg to be used to save a shift.
+  SDValue Hi = DAG.getNode(
+      ISD::TRUNCATE, DL, MVT::i8,
+      DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
+  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
+
+  // Copy the inverse of the parity flag into a register with setcc.
+  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+  // Extend to the original type.
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+}
+
 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  unsigned NewOpc = 0;
@ -29483,6 +29541,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG, Subtarget);
  case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
+  case ISD::PARITY:             return LowerPARITY(Op, Subtarget, DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
  case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
@ -43285,89 +43344,6 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
  return SDValue();
 }

-// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
-// Turn it into series of XORs and a setnp.
-static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
-                             const X86Subtarget &Subtarget) {
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  // RHS needs to be 1.
-  if (!isOneConstant(N1))
-    return SDValue();
-
-  // Popcnt may be truncated.
-  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
-    N0 = N0.getOperand(0);
-
-  // LHS needs to be a single use CTPOP.
-  if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
-    return SDValue();
-
-  EVT VT = N0.getValueType();
-
-  // We only support 64-bit and 32-bit. 64-bit requires special handling
-  // unless the 64-bit popcnt instruction is legal.
-  if (VT != MVT::i32 && VT != MVT::i64)
-    return SDValue();
-
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
-    return SDValue();
-
-  SDLoc DL(N);
-  SDValue X = N0.getOperand(0);
-
-  // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
-  if (DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
-    X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
-    SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
-                                DAG.getConstant(0, DL, MVT::i8));
-    // Copy the inverse of the parity flag into a register with setcc.
-    SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
-    // Extend or truncate to the original type.
-    return DAG.getZExtOrTrunc(Setnp, DL, N->getValueType(0));
-  }
-
-  // If this is 64-bit, its always best to xor the two 32-bit pieces together
-  // even if we have popcnt.
-  if (VT == MVT::i64) {
-    SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
-                             DAG.getNode(ISD::SRL, DL, VT, X,
-                                         DAG.getConstant(32, DL, MVT::i8)));
-    SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
-    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
-    // Generate a 32-bit parity idiom. This will bring us back here if we need
-    // to expand it too.
-    SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
-                                 DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
-                                 DAG.getConstant(1, DL, MVT::i32));
-    return DAG.getZExtOrTrunc(Parity, DL, N->getValueType(0));
-  }
-  assert(VT == MVT::i32 && "Unexpected VT!");
-
-  // Xor the high and low 16-bits together using a 32-bit operation.
-  SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
-                             DAG.getConstant(16, DL, MVT::i8));
-  X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
-
-  // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
-  // This should allow an h-reg to be used to save a shift.
-  // FIXME: We only get an h-reg in 32-bit mode.
-  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
-                           DAG.getNode(ISD::SRL, DL, VT, X,
-                                       DAG.getConstant(8, DL, MVT::i8)));
-  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
-  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
-  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
-
-  // Copy the inverse of the parity flag into a register with setcc.
-  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
-  // Extend or truncate to the original type.
-  return DAG.getZExtOrTrunc(Setnp, DL, N->getValueType(0));
-}
-
-
 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
 // Where C is a mask containing the same number of bits as the setcc and
 // where the setcc will freely 0 upper bits of k-register. We can replace the
@ -43459,10 +43435,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
    }
  }

-  // This must be done before legalization has expanded the ctpop.
-  if (SDValue V = combineParity(N, DAG, Subtarget))
-    return V;
-
  // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
  // TODO: Support multiple SrcOps.
  if (VT == MVT::i1) {
--- a/test/CodeGen/AArch64/parity.ll
+++ b/test/CodeGen/AArch64/parity.ll
@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+define i4 @parity_4(i4 %x) {
+; CHECK-LABEL: parity_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xf
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i4 @llvm.ctpop.i4(i4 %x)
+  %2 = and i4 %1, 1
+  ret i4 %2
+}
+
+define i8 @parity_8(i8 %x) {
+; CHECK-LABEL: parity_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i8 @llvm.ctpop.i8(i8 %x)
+  %2 = and i8 %1, 1
+  ret i8 %2
+}
+
+define i16 @parity_16(i16 %x) {
+; CHECK-LABEL: parity_16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    eor w8, w8, w8, lsr #8
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i16 @llvm.ctpop.i16(i16 %x)
+  %2 = and i16 %1, 1
+  ret i16 %2
+}
+
+define i17 @parity_17(i17 %x) {
+; CHECK-LABEL: parity_17:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0x1ffff
+; CHECK-NEXT:    eor w8, w8, w8, lsr #16
+; CHECK-NEXT:    eor w8, w8, w8, lsr #8
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i17 @llvm.ctpop.i17(i17 %x)
+  %2 = and i17 %1, 1
+  ret i17 %2
+}
+
+define i32 @parity_32(i32 %x) {
+; CHECK-LABEL: parity_32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor w8, w0, w0, lsr #16
+; CHECK-NEXT:    eor w8, w8, w8, lsr #8
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
+  %2 = and i32 %1, 1
+  ret i32 %2
+}
+
+define i64 @parity_64(i64 %x) {
+; CHECK-LABEL: parity_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x0, x0, lsr #32
+; CHECK-NEXT:    eor x8, x8, x8, lsr #16
+; CHECK-NEXT:    eor x8, x8, x8, lsr #8
+; CHECK-NEXT:    eor x8, x8, x8, lsr #4
+; CHECK-NEXT:    eor x8, x8, x8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and x0, x8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
+  %2 = and i64 %1, 1
+  ret i64 %2
+}
+
+define i32 @parity_64_trunc(i64 %x) {
+; CHECK-LABEL: parity_64_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x0, x0, lsr #32
+; CHECK-NEXT:    eor x8, x8, x8, lsr #16
+; CHECK-NEXT:    eor x8, x8, x8, lsr #8
+; CHECK-NEXT:    eor x8, x8, x8, lsr #4
+; CHECK-NEXT:    eor x8, x8, x8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
+  %2 = trunc i64 %1 to i32
+  %3 = and i32 %2, 1
+  ret i32 %3
+}
+
+define i8 @parity_32_trunc(i32 %x) {
+; CHECK-LABEL: parity_32_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor w8, w0, w0, lsr #16
+; CHECK-NEXT:    eor w8, w8, w8, lsr #8
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
+  %2 = trunc i32 %1 to i8
+  %3 = and i8 %2, 1
+  ret i8 %3
+}
+
+define i32 @parity_8_zext(i8 %x) {
+; CHECK-LABEL: parity_8_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %a = zext i8 %x to i32
+  %b = tail call i32 @llvm.ctpop.i32(i32 %a)
+  %c = and i32 %b, 1
+  ret i32 %c
+}
+
+define i32 @parity_8_mask(i32 %x) {
+; CHECK-LABEL: parity_8_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %a = and i32 %x, 255
+  %b = tail call i32 @llvm.ctpop.i32(i32 %a)
+  %c = and i32 %b, 1
+  ret i32 %c
+}
+
+declare i4 @llvm.ctpop.i4(i4 %x)
+declare i8 @llvm.ctpop.i8(i8 %x)
+declare i16 @llvm.ctpop.i16(i16 %x)
+declare i17 @llvm.ctpop.i17(i17 %x)
+declare i32 @llvm.ctpop.i32(i32 %x)
+declare i64 @llvm.ctpop.i64(i64 %x)
--- a/test/CodeGen/ARM/parity.ll
+++ b/test/CodeGen/ARM/parity.ll
@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple arm-eabi -mattr=+v6t2 | FileCheck %s
+
+define i4 @parity_4(i4 %x) {
+; CHECK-LABEL: parity_4:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    and r0, r0, #15
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i4 @llvm.ctpop.i4(i4 %x)
+  %2 = and i4 %1, 1
+  ret i4 %2
+}
+
+define i8 @parity_8(i8 %x) {
+; CHECK-LABEL: parity_8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i8 @llvm.ctpop.i8(i8 %x)
+  %2 = and i8 %1, 1
+  ret i8 %2
+}
+
+define i16 @parity_16(i16 %x) {
+; CHECK-LABEL: parity_16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i16 @llvm.ctpop.i16(i16 %x)
+  %2 = and i16 %1, 1
+  ret i16 %2
+}
+
+define i17 @parity_17(i17 %x) {
+; CHECK-LABEL: parity_17:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    bfc r0, #17, #15
+; CHECK-NEXT:    eor r0, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i17 @llvm.ctpop.i17(i17 %x)
+  %2 = and i17 %1, 1
+  ret i17 %2
+}
+
+define i32 @parity_32(i32 %x) {
+; CHECK-LABEL: parity_32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    eor r0, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
+  %2 = and i32 %1, 1
+  ret i32 %2
+}
+
+define i64 @parity_64(i64 %x) {
+; CHECK-LABEL: parity_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    eor r0, r0, r1
+; CHECK-NEXT:    mov r1, #0
+; CHECK-NEXT:    eor r0, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
+  %2 = and i64 %1, 1
+  ret i64 %2
+}
+
+define i32 @parity_64_trunc(i64 %x) {
+; CHECK-LABEL: parity_64_trunc:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    eor r0, r0, r1
+; CHECK-NEXT:    eor r0, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
+  %2 = trunc i64 %1 to i32
+  %3 = and i32 %2, 1
+  ret i32 %3
+}
+
+define i8 @parity_32_trunc(i32 %x) {
+; CHECK-LABEL: parity_32_trunc:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    eor r0, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
+  %2 = trunc i32 %1 to i8
+  %3 = and i8 %2, 1
+  ret i8 %3
+}
+
+define i32 @parity_8_zext(i8 %x) {
+; CHECK-LABEL: parity_8_zext:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %a = zext i8 %x to i32
+  %b = tail call i32 @llvm.ctpop.i32(i32 %a)
+  %c = and i32 %b, 1
+  ret i32 %c
+}
+
+define i32 @parity_8_mask(i32 %x) {
+; CHECK-LABEL: parity_8_mask:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %a = and i32 %x, 255
+  %b = tail call i32 @llvm.ctpop.i32(i32 %a)
+  %c = and i32 %b, 1
+  ret i32 %c
+}
+
+declare i4 @llvm.ctpop.i4(i4 %x)
+declare i8 @llvm.ctpop.i8(i8 %x)
+declare i16 @llvm.ctpop.i16(i16 %x)
+declare i17 @llvm.ctpop.i17(i17 %x)
+declare i32 @llvm.ctpop.i32(i32 %x)
+declare i64 @llvm.ctpop.i64(i64 %x)
--- a/test/CodeGen/X86/parity.ll
+++ b/test/CodeGen/X86/parity.ll
@ -4,6 +4,187 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X86-POPCNT
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT

+define i4 @parity_4(i4 %x) {
+; X86-NOPOPCNT-LABEL: parity_4:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    testb $15, {{[0-9]+}}(%esp)
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_4:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    testb $15, %dil
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_4:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    testb $15, {{[0-9]+}}(%esp)
+; X86-POPCNT-NEXT:    setnp %al
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_4:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    testb $15, %dil
+; X64-POPCNT-NEXT:    setnp %al
+; X64-POPCNT-NEXT:    retq
+  %1 = tail call i4 @llvm.ctpop.i4(i4 %x)
+  %2 = and i4 %1, 1
+  ret i4 %2
+}
+
+define i8 @parity_8(i8 %x) {
+; X86-NOPOPCNT-LABEL: parity_8:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_8:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    testb %dil, %dil
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_8:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-POPCNT-NEXT:    setnp %al
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_8:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    testb %dil, %dil
+; X64-POPCNT-NEXT:    setnp %al
+; X64-POPCNT-NEXT:    retq
+  %1 = tail call i8 @llvm.ctpop.i8(i8 %x)
+  %2 = and i8 %1, 1
+  ret i8 %2
+}
+
+define i16 @parity_16(i16 %x) {
+; X86-NOPOPCNT-LABEL: parity_16:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X86-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_16:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    movl %edi, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X64-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_16:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    popcntw {{[0-9]+}}(%esp), %ax
+; X86-POPCNT-NEXT:    andl $1, %eax
+; X86-POPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_16:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    popcntw %di, %ax
+; X64-POPCNT-NEXT:    andl $1, %eax
+; X64-POPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-POPCNT-NEXT:    retq
+  %1 = tail call i16 @llvm.ctpop.i16(i16 %x)
+  %2 = and i16 %1, 1
+  ret i16 %2
+}
+
+define i16 @parity_16_load(i16* %x) {
+; X86-NOPOPCNT-LABEL: parity_16_load:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOPOPCNT-NEXT:    movzwl (%eax), %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X86-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_16_load:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    movzwl (%rdi), %ecx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X64-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_16_load:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-POPCNT-NEXT:    popcntw (%eax), %ax
+; X86-POPCNT-NEXT:    andl $1, %eax
+; X86-POPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_16_load:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    popcntw (%rdi), %ax
+; X64-POPCNT-NEXT:    andl $1, %eax
+; X64-POPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-POPCNT-NEXT:    retq
+  %1 = load i16, i16* %x
+  %2 = tail call i16 @llvm.ctpop.i16(i16 %1)
+  %3 = and i16 %2, 1
+  ret i16 %3
+}
+
+define i17 @parity_17(i17 %x) {
+; X86-NOPOPCNT-LABEL: parity_17:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOPOPCNT-NEXT:    movl %ecx, %eax
+; X86-NOPOPCNT-NEXT:    andl $131071, %eax # imm = 0x1FFFF
+; X86-NOPOPCNT-NEXT:    movl %eax, %edx
+; X86-NOPOPCNT-NEXT:    shrl $16, %edx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %edx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X86-NOPOPCNT-NEXT:    xorb %dl, %ch
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_17:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    movl %edi, %eax
+; X64-NOPOPCNT-NEXT:    andl $131071, %eax # imm = 0x1FFFF
+; X64-NOPOPCNT-NEXT:    movl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    shrl $16, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    shrl $8, %edi
+; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X64-NOPOPCNT-NEXT:    xorb %cl, %dil
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_17:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    movl $131071, %eax # imm = 0x1FFFF
+; X86-POPCNT-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-POPCNT-NEXT:    popcntl %eax, %eax
+; X86-POPCNT-NEXT:    andl $1, %eax
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_17:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    andl $131071, %edi # imm = 0x1FFFF
+; X64-POPCNT-NEXT:    popcntl %edi, %eax
+; X64-POPCNT-NEXT:    andl $1, %eax
+; X64-POPCNT-NEXT:    retq
+  %1 = tail call i17 @llvm.ctpop.i17(i17 %x)
+  %2 = and i17 %1, 1
+  ret i17 %2
+}
+
 define i32 @parity_32(i32 %x) {
 ; X86-NOPOPCNT-LABEL: parity_32:
 ; X86-NOPOPCNT:       # %bb.0:
@ -157,14 +338,14 @@ define i8 @parity_32_trunc(i32 %x) {
 ; X86-POPCNT-LABEL: parity_32_trunc:
 ; X86-POPCNT:       # %bb.0:
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    andb $1, %al
+; X86-POPCNT-NEXT:    andl $1, %eax
 ; X86-POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-POPCNT-NEXT:    retl
 ;
 ; X64-POPCNT-LABEL: parity_32_trunc:
 ; X64-POPCNT:       # %bb.0:
 ; X64-POPCNT-NEXT:    popcntl %edi, %eax
-; X64-POPCNT-NEXT:    andb $1, %al
+; X64-POPCNT-NEXT:    andl $1, %eax
 ; X64-POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-POPCNT-NEXT:    retq
  %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
@ -241,5 +422,9 @@ define i32 @parity_8_mask(i32 %x) {
  ret i32 %c
 }

+declare i4 @llvm.ctpop.i4(i4 %x)
+declare i8 @llvm.ctpop.i8(i8 %x)
+declare i16 @llvm.ctpop.i16(i16 %x)
+declare i17 @llvm.ctpop.i17(i17 %x)
 declare i32 @llvm.ctpop.i32(i32 %x)
 declare i64 @llvm.ctpop.i64(i64 %x)
--- a/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/test/CodeGen/X86/vector-reduce-xor-bool.ll
@ -53,7 +53,7 @@ define i1 @trunc_v2i64_v2i1(<2 x i64>) {
 ; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $3, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
  %a = trunc <2 x i64> %0 to <2 x i1>
@ -103,7 +103,7 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) {
 ; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $15, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
  %a = trunc <4 x i32> %0 to <4 x i1>
@ -251,7 +251,7 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
 ; AVX512VL-NEXT:    vpsllq $63, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vptestmq %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $15, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@ -974,7 +974,7 @@ define i1 @icmp_v2i64_v2i1(<2 x i64>) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $3, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
  %a = icmp eq <2 x i64> %0, zeroinitializer
@ -1025,7 +1025,7 @@ define i1 @icmp_v4i32_v4i1(<4 x i32>) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $15, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
  %a = icmp eq <4 x i32> %0, zeroinitializer
@ -1214,7 +1214,7 @@ define i1 @icmp_v4i64_v4i1(<4 x i64>) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vptestnmq %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $15, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq