[MIPS GlobalISel] Select population count (popcount)

G_CTPOP is generated from llvm.ctpop.<type> intrinsics, clang generates these intrinsics from __builtin_popcount and __builtin_popcountll. Add lower and narrow scalar for G_CTPOP. Lower G_CTPOP for MIPS32. Differential Revision: https://reviews.llvm.org/D73216
2024-10-18 18:42:46 +02:00 · 2020-01-27 09:59:50 +01:00 · 2020-01-27 09:59:50 +01:00 · 75e76863f0
commit 75e76863f0
parent 4b729fba3d
5 changed files with 263 additions and 0 deletions
--- a/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@ -243,6 +243,7 @@ public:
  LegalizeResult narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
  LegalizeResult narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
  LegalizeResult narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
+  LegalizeResult narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, LLT Ty);

  LegalizeResult lowerBitcast(MachineInstr &MI);
  LegalizeResult lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@ -983,6 +983,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
        return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTTZ:
        return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
+      case TargetOpcode::G_CTPOP:
+        return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
      default:
        return UnableToLegalize;
      }
@ -3918,6 +3920,30 @@ LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
  return UnableToLegalize;
 }

+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
+                                   LLT NarrowTy) {
+  if (TypeIdx != 1)
+    return UnableToLegalize;
+
+  LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+  unsigned NarrowSize = NarrowTy.getSizeInBits();
+
+  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
+    auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
+
+    auto LoCTPOP = MIRBuilder.buildCTPOP(NarrowTy, UnmergeSrc.getReg(0));
+    auto HiCTPOP = MIRBuilder.buildCTPOP(NarrowTy, UnmergeSrc.getReg(1));
+    auto Out = MIRBuilder.buildAdd(NarrowTy, HiCTPOP, LoCTPOP);
+    MIRBuilder.buildZExt(MI.getOperand(0), Out);
+
+    MI.eraseFromParent();
+    return Legalized;
+  }
+
+  return UnableToLegalize;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
  unsigned Opc = MI.getOpcode();
@ -4017,6 +4043,57 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    return Legalized;
  }
+  case TargetOpcode::G_CTPOP: {
+    unsigned Size = Ty.getSizeInBits();
+    MachineIRBuilder &B = MIRBuilder;
+
+    // Count set bits in blocks of 2 bits. Default approach would be
+    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
+    // We use following formula instead:
+    // B2Count = val - { (val >> 1) & 0x55555555 }
+    // since it gives same result in blocks of 2 with one instruction less.
+    auto C_1 = B.buildConstant(Ty, 1);
+    auto B2Set1LoTo1Hi = B.buildLShr(Ty, MI.getOperand(1).getReg(), C_1);
+    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
+    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
+    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
+    auto B2Count = B.buildSub(Ty, MI.getOperand(1).getReg(), B2Count1Hi);
+
+    // In order to get count in blocks of 4 add values from adjacent block of 2.
+    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
+    auto C_2 = B.buildConstant(Ty, 2);
+    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
+    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
+    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
+    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
+    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
+    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
+
+    // For count in blocks of 8 bits we don't have to mask high 4 bits before
+    // addition since count value sits in range {0,...,8} and 4 bits are enough
+    // to hold such binary values. After addition high 4 bits still hold count
+    // of set bits in high 4 bit block, set them to zero and get 8 bit result.
+    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
+    auto C_4 = B.buildConstant(Ty, 4);
+    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
+    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
+    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
+    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
+    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
+
+    assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
+    // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
+    // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
+    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
+    auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
+
+    // Shift count result from 8 high bits to low bits.
+    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
+    B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
+
+    MI.eraseFromParent();
+    return Legalized;
+  }
  }
 }

--- a/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/lib/Target/Mips/MipsLegalizerInfo.cpp
@ -214,6 +214,10 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
      .lowerFor({{s32, s32}, {s64, s64}});

+  getActionDefinitionsBuilder(G_CTPOP)
+      .lowerFor({{s32, s32}})
+      .clampScalar(1, s32, s32);
+
  // FP instructions
  getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({s32, s64});
--- a/test/CodeGen/Mips/GlobalISel/legalizer/ctpop.mir
+++ b/test/CodeGen/Mips/GlobalISel/legalizer/ctpop.mir
@ -0,0 +1,102 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=MIPS32
+---
+name:            ctpop_i32
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0
+
+    ; MIPS32-LABEL: name: ctpop_i32
+    ; MIPS32: liveins: $a0
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655765
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+    ; MIPS32: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[AND]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; MIPS32: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32)
+    ; MIPS32: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 858993459
+    ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C3]]
+    ; MIPS32: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C3]]
+    ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND1]], [[AND2]]
+    ; MIPS32: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; MIPS32: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C4]](s32)
+    ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
+    ; MIPS32: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135
+    ; MIPS32: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]]
+    ; MIPS32: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009
+    ; MIPS32: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]]
+    ; MIPS32: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32)
+    ; MIPS32: $v0 = COPY [[LSHR3]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = COPY $a0
+    %1:_(s32) = G_CTPOP %0(s32)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            ctpop_i64
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: ctpop_i64
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655765
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+    ; MIPS32: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[AND]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; MIPS32: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32)
+    ; MIPS32: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 858993459
+    ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C3]]
+    ; MIPS32: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C3]]
+    ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND1]], [[AND2]]
+    ; MIPS32: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; MIPS32: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C4]](s32)
+    ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]]
+    ; MIPS32: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135
+    ; MIPS32: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]]
+    ; MIPS32: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009
+    ; MIPS32: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]]
+    ; MIPS32: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32)
+    ; MIPS32: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32)
+    ; MIPS32: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C1]]
+    ; MIPS32: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[AND4]]
+    ; MIPS32: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[SUB1]], [[C2]](s32)
+    ; MIPS32: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR5]], [[C3]]
+    ; MIPS32: [[AND6:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C3]]
+    ; MIPS32: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[AND5]], [[AND6]]
+    ; MIPS32: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[ADD2]], [[C4]](s32)
+    ; MIPS32: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR6]], [[ADD2]]
+    ; MIPS32: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ADD3]], [[C5]]
+    ; MIPS32: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND7]], [[C6]]
+    ; MIPS32: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[MUL1]], [[C7]](s32)
+    ; MIPS32: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[LSHR7]], [[LSHR3]]
+    ; MIPS32: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; MIPS32: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ADD4]](s32), [[C8]](s32)
+    ; MIPS32: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
+    ; MIPS32: $v0 = COPY [[UV]](s32)
+    ; MIPS32: $v1 = COPY [[UV1]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1
+    %1:_(s32) = COPY $a0
+    %2:_(s32) = COPY $a1
+    %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32)
+    %3:_(s64) = G_CTPOP %0(s64)
+    %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3(s64)
+    $v0 = COPY %4(s32)
+    $v1 = COPY %5(s32)
+    RetRA implicit $v0, implicit $v1
+
+...
--- a/test/CodeGen/Mips/GlobalISel/llvm-ir/ctpop.ll
+++ b/test/CodeGen/Mips/GlobalISel/llvm-ir/ctpop.ll
@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=mipsel-linux-gnu -global-isel -verify-machineinstrs %s -o -| FileCheck %s -check-prefixes=MIPS32
+
+define i32 @ctpop_i32(i32 %a) {
+; MIPS32-LABEL: ctpop_i32:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    srl $1, $4, 1
+; MIPS32-NEXT:    lui $2, 21845
+; MIPS32-NEXT:    ori $2, $2, 21845
+; MIPS32-NEXT:    and $1, $1, $2
+; MIPS32-NEXT:    subu $1, $4, $1
+; MIPS32-NEXT:    srl $2, $1, 2
+; MIPS32-NEXT:    lui $3, 13107
+; MIPS32-NEXT:    ori $3, $3, 13107
+; MIPS32-NEXT:    and $2, $2, $3
+; MIPS32-NEXT:    and $1, $1, $3
+; MIPS32-NEXT:    addu $1, $2, $1
+; MIPS32-NEXT:    srl $2, $1, 4
+; MIPS32-NEXT:    addu $1, $2, $1
+; MIPS32-NEXT:    lui $2, 3855
+; MIPS32-NEXT:    ori $2, $2, 3855
+; MIPS32-NEXT:    and $1, $1, $2
+; MIPS32-NEXT:    lui $2, 257
+; MIPS32-NEXT:    ori $2, $2, 257
+; MIPS32-NEXT:    mul $1, $1, $2
+; MIPS32-NEXT:    srl $2, $1, 24
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %0 = call i32 @llvm.ctpop.i32(i32 %a)
+  ret i32 %0
+}
+declare i32 @llvm.ctpop.i32(i32)
+
+
+define i64 @ctpop_i64(i64 %a) {
+; MIPS32-LABEL: ctpop_i64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    srl $1, $4, 1
+; MIPS32-NEXT:    lui $2, 21845
+; MIPS32-NEXT:    ori $2, $2, 21845
+; MIPS32-NEXT:    and $1, $1, $2
+; MIPS32-NEXT:    subu $1, $4, $1
+; MIPS32-NEXT:    srl $3, $1, 2
+; MIPS32-NEXT:    lui $4, 13107
+; MIPS32-NEXT:    ori $4, $4, 13107
+; MIPS32-NEXT:    and $3, $3, $4
+; MIPS32-NEXT:    and $1, $1, $4
+; MIPS32-NEXT:    addu $1, $3, $1
+; MIPS32-NEXT:    srl $3, $1, 4
+; MIPS32-NEXT:    addu $1, $3, $1
+; MIPS32-NEXT:    lui $3, 3855
+; MIPS32-NEXT:    ori $3, $3, 3855
+; MIPS32-NEXT:    and $1, $1, $3
+; MIPS32-NEXT:    lui $6, 257
+; MIPS32-NEXT:    ori $6, $6, 257
+; MIPS32-NEXT:    mul $1, $1, $6
+; MIPS32-NEXT:    srl $1, $1, 24
+; MIPS32-NEXT:    srl $7, $5, 1
+; MIPS32-NEXT:    and $2, $7, $2
+; MIPS32-NEXT:    subu $2, $5, $2
+; MIPS32-NEXT:    srl $5, $2, 2
+; MIPS32-NEXT:    and $5, $5, $4
+; MIPS32-NEXT:    and $2, $2, $4
+; MIPS32-NEXT:    addu $2, $5, $2
+; MIPS32-NEXT:    srl $4, $2, 4
+; MIPS32-NEXT:    addu $2, $4, $2
+; MIPS32-NEXT:    and $2, $2, $3
+; MIPS32-NEXT:    mul $2, $2, $6
+; MIPS32-NEXT:    srl $2, $2, 24
+; MIPS32-NEXT:    addu $2, $2, $1
+; MIPS32-NEXT:    ori $3, $zero, 0
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %0 = call i64 @llvm.ctpop.i64(i64 %a)
+  ret i64 %0
+}
+declare i64 @llvm.ctpop.i64(i64)