From 18108c50ecdd67f15b3ab6d9162d6443124c8f59 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Wed, 21 Jul 2021 13:04:06 -0700 Subject: [PATCH] [AArch64][GlobalISel] Legalize ctpop s128 Differential revision: https://reviews.llvm.org/D106494 --- lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 10 +++- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 7 +-- .../AArch64/GlobalISel/legalize-ctpop.mir | 24 ++++++++ test/CodeGen/AArch64/popcount.ll | 58 +++++++++++-------- 4 files changed, 69 insertions(+), 30 deletions(-) diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index c1e0d2549c4..50f0afbf6af 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -5622,7 +5622,15 @@ LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0)); auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1)); - MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP); + + LLT CountTy = LLT::scalar(Log2_64_Ceil(SrcTy.getSizeInBits())); + if (CountTy.getSizeInBits() < DstTy.getSizeInBits()) { + LoCTPOP = MIRBuilder.buildTrunc(CountTy, LoCTPOP); + HiCTPOP = MIRBuilder.buildTrunc(CountTy, HiCTPOP); + auto Add = MIRBuilder.buildAdd(CountTy, HiCTPOP, LoCTPOP); + MIRBuilder.buildZExt(DstReg, Add); + } else + MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP); MI.eraseFromParent(); return Legalized; diff --git a/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 08e4a119127..f7764676708 100644 --- a/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -764,7 +764,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_SBFX, G_UBFX}) .customFor({{s32, s32}, {s64, s64}}); - // TODO: Custom legalization for s128 // TODO: Use generic lowering when custom lowering is not possible. auto always = [=](const LegalityQuery &Q) { return true; }; getActionDefinitionsBuilder(G_CTPOP) @@ -775,6 +774,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .maxScalarEltSameAsIf(always, 1, 0) .customFor({{s32, s32}, {s64, s64}, + {s128, s128}, {v2s64, v2s64}, {v2s32, v2s32}, {v4s32, v4s32}, @@ -1151,8 +1151,7 @@ bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI, // v8s16,v4s32,v2s64 -> v16i8 LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8); if (Ty.isScalar()) { - // TODO: Handle s128. - assert((Size == 32 || Size == 64) && "Expected only 32 or 64 bit scalars!"); + assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!"); if (Size == 32) { Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0); } @@ -1198,7 +1197,7 @@ bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI, } // Post-conditioning. - if (Ty.isScalar() && Size == 64) + if (Ty.isScalar() && (Size == 64 || Size == 128)) MIRBuilder.buildZExt(Dst, UADD); else UADD->getOperand(0).setReg(Dst); diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir b/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir index 04406c15296..4748314f69f 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir @@ -78,6 +78,30 @@ body: | $x0 = COPY %ctpop(s64) RET_ReallyLR implicit $x0 +... +--- +name: s128_lower +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0 + ; CHECK-LABEL: name: s128_lower + ; CHECK: liveins: $q0 + ; CHECK: %copy:_(s128) = COPY $q0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST %copy(s128) + ; CHECK: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<16 x s8>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[INT]](s32), [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: %ctpop:_(s128) = G_MERGE_VALUES [[MV]](s64), [[C1]](s64) + ; CHECK: $q0 = COPY %ctpop(s128) + ; CHECK: RET_ReallyLR implicit $q0 + %copy:_(s128) = COPY $q0 + %ctpop:_(s128) = G_CTPOP %copy(s128) + $q0 = COPY %ctpop(s128) + RET_ReallyLR implicit $q0 + ... --- name: widen_s16 diff --git a/test/CodeGen/AArch64/popcount.ll b/test/CodeGen/AArch64/popcount.ll index 2e5e988f057..1681b86cdc2 100644 --- a/test/CodeGen/AArch64/popcount.ll +++ b/test/CodeGen/AArch64/popcount.ll @@ -5,15 +5,12 @@ define i8 @popcount128(i128* nocapture nonnull readonly %0) { ; CHECK-LABEL: popcount128: ; CHECK: // %bb.0: // %Entry -; CHECK-NEXT: ldr x8, [x0, #8] -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: // implicit-def: $q0 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: cnt v0.16b, v0.16b ; CHECK-NEXT: uaddlv h1, v0.16b ; CHECK-NEXT: // implicit-def: $q0 ; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret Entry: @@ -30,27 +27,35 @@ declare i128 @llvm.ctpop.i128(i128) define i16 @popcount256(i256* nocapture nonnull readonly %0) { ; CHECK-LABEL: popcount256: ; CHECK: // %bb.0: // %Entry -; CHECK-NEXT: ldr x8, [x0, #8] -; CHECK-NEXT: ldr x9, [x0, #24] -; CHECK-NEXT: ldr d1, [x0, #16] +; CHECK-NEXT: ldr x11, [x0] +; CHECK-NEXT: ldr x10, [x0, #8] +; CHECK-NEXT: ldr x9, [x0, #16] +; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: // implicit-def: $q0 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.d[1], x9 +; CHECK-NEXT: mov v0.d[0], x11 +; CHECK-NEXT: mov v0.d[1], x10 +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: mov v1.d[0], x9 +; CHECK-NEXT: mov v1.d[1], x8 ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlv h1, v0.16b +; CHECK-NEXT: uaddlv h2, v0.16b ; CHECK-NEXT: // implicit-def: $q0 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: cnt v1.16b, v1.16b +; CHECK-NEXT: uaddlv h2, v1.16b +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: mov v1.16b, v2.16b +; CHECK-NEXT: // kill: def $s1 killed $s1 killed $q1 +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: // implicit-def: $q0 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.d[1], x8 -; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlv h1, v0.16b -; CHECK-NEXT: // implicit-def: $q0 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: // implicit-def: $w9 +; CHECK-NEXT: // kill: def $x8 killed $w8 +; CHECK-NEXT: // kill: def $x9 killed $w9 +; CHECK-NEXT: bfi x8, x9, #32, #32 +; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret Entry: %1 = load i256, i256* %0, align 16 @@ -66,16 +71,19 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) { ; CHECK-LABEL: popcount1x128: ; CHECK: // %bb.0: // %Entry ; CHECK-NEXT: // implicit-def: $q0 -; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: mov v0.d[0], x0 ; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: cnt v0.16b, v0.16b ; CHECK-NEXT: uaddlv h1, v0.16b ; CHECK-NEXT: // implicit-def: $q0 ; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: // kill: def $x0 killed $w0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov x1, v0.d[1] +; CHECK-NEXT: // kill: def $x8 killed $w8 +; CHECK-NEXT: bfi x0, x8, #32, #32 +; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: ret Entry: %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)