From 7e17214e5b653c65aa5d1132de2e1ff5ffe80f3e Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Tue, 11 Feb 2020 10:22:38 -0800 Subject: [PATCH] [AArch64][GlobalISel] Properly implement widening for TB(N)Z When we have to widen to a 64-bit register, we have to emit a SUBREG_TO_REG. Add a general-purpose widening helpe which emits the correct SUBREG_TO_REG instruction based off of a desired size and add a testcase. Also remove some asserts which are technically incorrect in `emitTestBit`. - p0 doesn't count as a scalar type, so we need to check `!Ty.isVector()` instead - Whenever we have a s1, the Size/Bit checks are too conservative, so just remove them Replace these asserts with less conservative ones where applicable. Differential Revision: https://reviews.llvm.org/D74427 --- .../AArch64/AArch64InstructionSelector.cpp | 71 +++++-- .../GlobalISel/opt-fold-ext-tbz-tbnz.mir | 5 +- .../GlobalISel/widen-narrow-tbz-tbnz.mir | 193 ++++++++++++++++++ 3 files changed, 254 insertions(+), 15 deletions(-) create mode 100644 test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 83f1004c258..25aba784b19 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -273,6 +273,8 @@ private: /// new copy. Register narrowExtendRegIfNeeded(Register ExtReg, MachineIRBuilder &MIB) const; + Register widenGPRBankRegIfNeeded(Register Reg, unsigned Size, + MachineIRBuilder &MIB) const; ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, @@ -1124,26 +1126,25 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, MachineInstr *AArch64InstructionSelector::emitTestBit( Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const { - MachineRegisterInfo &MRI = *MIB.getMRI(); -#ifndef NDEBUG + assert(TestReg.isValid()); assert(ProduceNonFlagSettingCondBr && "Cannot emit TB(N)Z with speculation tracking!"); - assert(TestReg.isValid()); - LLT Ty = MRI.getType(TestReg); - unsigned Size = Ty.getSizeInBits(); - assert(Bit < Size && - "Bit to test must be smaler than the size of a test register!"); - assert(Ty.isScalar() && "Expected a scalar!"); - assert(Size >= 32 && "Expected at least a 32-bit register!"); -#endif + MachineRegisterInfo &MRI = *MIB.getMRI(); // Attempt to optimize the test bit by walking over instructions. TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI); - bool UseWReg = Bit < 32; + LLT Ty = MRI.getType(TestReg); + unsigned Size = Ty.getSizeInBits(); + assert(!Ty.isVector() && "Expected a scalar!"); + assert(Bit < 64 && "Bit is too large!"); // When the test register is a 64-bit register, we have to narrow to make // TBNZW work. - if (UseWReg) + bool UseWReg = Bit < 32; + unsigned NecessarySize = UseWReg ? 32 : 64; + if (Size < NecessarySize) + TestReg = widenGPRBankRegIfNeeded(TestReg, NecessarySize, MIB); + else if (Size > NecessarySize) TestReg = narrowExtendRegIfNeeded(TestReg, MIB); static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, @@ -5154,6 +5155,52 @@ Register AArch64InstructionSelector::narrowExtendRegIfNeeded( return Copy.getReg(0); } +Register AArch64InstructionSelector::widenGPRBankRegIfNeeded( + Register Reg, unsigned WideSize, MachineIRBuilder &MIB) const { + assert(WideSize >= 8 && "WideSize is smaller than all possible registers?"); + MachineRegisterInfo &MRI = *MIB.getMRI(); + unsigned NarrowSize = MRI.getType(Reg).getSizeInBits(); + assert(WideSize >= NarrowSize && + "WideSize cannot be smaller than NarrowSize!"); + + // If the sizes match, just return the register. + // + // If NarrowSize is an s1, then we can select it to any size, so we'll treat + // it as a don't care. + if (NarrowSize == WideSize || NarrowSize == 1) + return Reg; + + // Now check the register classes. + const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI); + const TargetRegisterClass *OrigRC = getMinClassForRegBank(*RB, NarrowSize); + const TargetRegisterClass *WideRC = getMinClassForRegBank(*RB, WideSize); + assert(OrigRC && "Could not determine narrow RC?"); + assert(WideRC && "Could not determine wide RC?"); + + // If the sizes differ, but the register classes are the same, there is no + // need to insert a SUBREG_TO_REG. + // + // For example, an s8 that's supposed to be a GPR will be selected to either + // a GPR32 or a GPR64 register. Note that this assumes that the s8 will + // always end up on a GPR32. + if (OrigRC == WideRC) + return Reg; + + // We have two different register classes. Insert a SUBREG_TO_REG. + unsigned SubReg = 0; + getSubRegForClass(OrigRC, TRI, SubReg); + assert(SubReg && "Couldn't determine subregister?"); + + // Build the SUBREG_TO_REG and return the new, widened register. + auto SubRegToReg = + MIB.buildInstr(AArch64::SUBREG_TO_REG, {WideRC}, {}) + .addImm(0) + .addUse(Reg) + .addImm(SubReg); + constrainSelectedInstRegOperands(*SubRegToReg, TII, TRI, RBI); + return SubRegToReg.getReg(0); +} + /// Select an "extended register" operand. This operand folds in an extend /// followed by an optional left shift. InstructionSelector::ComplexRendererFns diff --git a/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir b/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir index 2d4fd65ab05..af8e03be913 100644 --- a/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir +++ b/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir @@ -78,9 +78,8 @@ body: | ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) ; CHECK: liveins: $h0 ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, $h0, %subreg.hsub - ; CHECK: %copy:gpr32all = COPY [[SUBREG_TO_REG]] - ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %copy - ; CHECK: TBNZW [[COPY]], 3, %bb.1 + ; CHECK: %copy:gpr32 = COPY [[SUBREG_TO_REG]] + ; CHECK: TBNZW %copy, 3, %bb.1 ; CHECK: B %bb.0 ; CHECK: bb.1: ; CHECK: RET_ReallyLR diff --git a/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir b/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir new file mode 100644 index 00000000000..22963c50a2e --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir @@ -0,0 +1,193 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +# +# Test widening and narrowing on test bit operations using subregister copies +# or SUBREG_TO_REG. +--- | + @glob = external unnamed_addr global i1, align 4 + define void @s1_no_copy() { ret void } + define void @s16_no_copy() { ret void } + define void @p0_no_copy() { ret void } + define void @widen_s32_to_s64() { ret void } + define void @widen_s16_to_s64() { ret void } + define void @narrow_s64_to_s32() { ret void } + +... +--- +name: s1_no_copy +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: s1_no_copy + ; CHECK: bb.0: + ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) + ; CHECK: %narrow:gpr32 = IMPLICIT_DEF + ; CHECK: TBNZW %narrow, 0, %bb.1 + ; CHECK: B %bb.0 + ; CHECK: bb.1: + ; CHECK: RET_ReallyLR + bb.0: + successors: %bb.0, %bb.1 + %narrow:gpr(s1) = G_IMPLICIT_DEF + + ; There should be no copy here, because the s1 can be selected to a GPR32. + G_BRCOND %narrow(s1), %bb.1 + G_BR %bb.0 + bb.1: + RET_ReallyLR +... +--- +name: s16_no_copy +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: s16_no_copy + ; CHECK: bb.0: + ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) + ; CHECK: %narrow:gpr32 = IMPLICIT_DEF + ; CHECK: TBNZW %narrow, 0, %bb.1 + ; CHECK: B %bb.0 + ; CHECK: bb.1: + ; CHECK: RET_ReallyLR + bb.0: + successors: %bb.0, %bb.1 + %narrow:gpr(s16) = G_IMPLICIT_DEF + %trunc:gpr(s1) = G_TRUNC %narrow(s16) + + ; Look through the G_TRUNC to get the G_IMPLICIT_DEF. We don't need a + ; SUBREG_TO_REG here, because the s16 will end up on a 32-bit register. + G_BRCOND %trunc(s1), %bb.1 + G_BR %bb.0 + bb.1: + RET_ReallyLR +... +--- +name: p0_no_copy +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: p0_no_copy + ; CHECK: bb.0: + ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) + ; CHECK: %glob:gpr64common = MOVaddr target-flags(aarch64-page) @glob, target-flags(aarch64-pageoff, aarch64-nc) @glob + ; CHECK: %load:gpr32 = LDRBBui %glob, 0 :: (dereferenceable load 1 from @glob, align 4) + ; CHECK: TBNZW %load, 0, %bb.1 + ; CHECK: B %bb.0 + ; CHECK: bb.1: + ; CHECK: RET_ReallyLR + bb.0: + successors: %bb.0, %bb.1 + %glob:gpr(p0) = G_GLOBAL_VALUE @glob + %load:gpr(s8) = G_LOAD %glob(p0) :: (dereferenceable load 1 from @glob, align 4) + %trunc:gpr(s1) = G_TRUNC %load(s8) + + ; Look through G_TRUNC to get the load. The load is into a s8, which will + ; be selected to a GPR32, so we don't need a copy. + G_BRCOND %trunc(s1), %bb.1 + G_BR %bb.0 + bb.1: + RET_ReallyLR +... +--- +name: widen_s32_to_s64 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: widen_s32_to_s64 + ; CHECK: bb.0: + ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $w0 + ; CHECK: %reg:gpr32all = COPY $w0 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %reg, %subreg.sub_32 + ; CHECK: TBZX [[SUBREG_TO_REG]], 33, %bb.1 + ; CHECK: B %bb.0 + ; CHECK: bb.1: + ; CHECK: RET_ReallyLR + bb.0: + successors: %bb.0, %bb.1 + liveins: $w0 + %reg:gpr(s32) = COPY $w0 + %zext:gpr(s64) = G_ZEXT %reg(s32) + %bit:gpr(s64) = G_CONSTANT i64 8589934592 + %zero:gpr(s64) = G_CONSTANT i64 0 + %and:gpr(s64) = G_AND %zext, %bit + %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero + + ; We should widen using a SUBREG_TO_REG here, because we need a TBZX to get + ; bit 33. The subregister should be sub_32. + %trunc:gpr(s1) = G_TRUNC %cmp(s32) + G_BRCOND %trunc(s1), %bb.1 + G_BR %bb.0 + bb.1: + RET_ReallyLR +... +--- +name: widen_s16_to_s64 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: widen_s16_to_s64 + ; CHECK: bb.0: + ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) + ; CHECK: %reg:gpr32 = IMPLICIT_DEF + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %reg, %subreg.sub_32 + ; CHECK: TBZX [[SUBREG_TO_REG]], 33, %bb.1 + ; CHECK: B %bb.0 + ; CHECK: bb.1: + ; CHECK: RET_ReallyLR + bb.0: + successors: %bb.0, %bb.1 + %reg:gpr(s16) = G_IMPLICIT_DEF + %zext:gpr(s64) = G_ZEXT %reg(s16) + %bit:gpr(s64) = G_CONSTANT i64 8589934592 + %zero:gpr(s64) = G_CONSTANT i64 0 + %and:gpr(s64) = G_AND %zext, %bit + %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero + + ; We should widen using a SUBREG_TO_REG here, because we need a TBZX to get + ; bit 33. The subregister should be sub_32, because s16 will end up on a + ; GPR32. + %trunc:gpr(s1) = G_TRUNC %cmp(s32) + G_BRCOND %trunc(s1), %bb.1 + G_BR %bb.0 + bb.1: + RET_ReallyLR +... +--- +name: narrow_s64_to_s32 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: narrow_s64_to_s32 + ; CHECK: bb.0: + ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $x0 + ; CHECK: %wide:gpr64 = COPY $x0 + ; CHECK: %trunc:gpr32 = COPY %wide.sub_32 + ; CHECK: TBNZW %trunc, 0, %bb.1 + ; CHECK: B %bb.0 + ; CHECK: bb.1: + ; CHECK: RET_ReallyLR + bb.0: + successors: %bb.0, %bb.1 + liveins: $x0 + %wide:gpr(s64) = COPY $x0 + + ; We should narrow using a subregister copy here. + %trunc:gpr(s1) = G_TRUNC %wide(s64) + G_BRCOND %trunc(s1), %bb.1 + G_BR %bb.0 + bb.1: + RET_ReallyLR