[AArch64][GlobalISel] Properly implement widening for TB(N)Z

When we have to widen to a 64-bit register, we have to emit a SUBREG_TO_REG. Add a general-purpose widening helpe which emits the correct SUBREG_TO_REG instruction based off of a desired size and add a testcase. Also remove some asserts which are technically incorrect in `emitTestBit`. - p0 doesn't count as a scalar type, so we need to check `!Ty.isVector()` instead - Whenever we have a s1, the Size/Bit checks are too conservative, so just remove them Replace these asserts with less conservative ones where applicable. Differential Revision: https://reviews.llvm.org/D74427
2024-11-25 04:02:41 +01:00 · 2020-02-11 10:22:38 -08:00 · 2020-02-11 10:22:38 -08:00 · 7e17214e5b
commit 7e17214e5b
parent ed8bd8f4b1
3 changed files with 254 additions and 15 deletions
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@ -273,6 +273,8 @@ private:
  /// new copy.
  Register narrowExtendRegIfNeeded(Register ExtReg,
                                             MachineIRBuilder &MIB) const;
+  Register widenGPRBankRegIfNeeded(Register Reg, unsigned Size,
+                                   MachineIRBuilder &MIB) const;
  ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;

  void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
@ -1124,26 +1126,25 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
 MachineInstr *AArch64InstructionSelector::emitTestBit(
    Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
    MachineIRBuilder &MIB) const {
-  MachineRegisterInfo &MRI = *MIB.getMRI();
-#ifndef NDEBUG
+  assert(TestReg.isValid());
  assert(ProduceNonFlagSettingCondBr &&
         "Cannot emit TB(N)Z with speculation tracking!");
-  assert(TestReg.isValid());
-  LLT Ty = MRI.getType(TestReg);
-  unsigned Size = Ty.getSizeInBits();
-  assert(Bit < Size &&
-         "Bit to test must be smaler than the size of a test register!");
-  assert(Ty.isScalar() && "Expected a scalar!");
-  assert(Size >= 32 && "Expected at least a 32-bit register!");
-#endif
+  MachineRegisterInfo &MRI = *MIB.getMRI();

  // Attempt to optimize the test bit by walking over instructions.
  TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
-  bool UseWReg = Bit < 32;
+  LLT Ty = MRI.getType(TestReg);
+  unsigned Size = Ty.getSizeInBits();
+  assert(!Ty.isVector() && "Expected a scalar!");
+  assert(Bit < 64 && "Bit is too large!");

  // When the test register is a 64-bit register, we have to narrow to make
  // TBNZW work.
-  if (UseWReg)
+  bool UseWReg = Bit < 32;
+  unsigned NecessarySize = UseWReg ? 32 : 64;
+  if (Size < NecessarySize)
+    TestReg = widenGPRBankRegIfNeeded(TestReg, NecessarySize, MIB);
+  else if (Size > NecessarySize)
    TestReg = narrowExtendRegIfNeeded(TestReg, MIB);

  static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
@ -5154,6 +5155,52 @@ Register AArch64InstructionSelector::narrowExtendRegIfNeeded(
  return Copy.getReg(0);
 }

+Register AArch64InstructionSelector::widenGPRBankRegIfNeeded(
+    Register Reg, unsigned WideSize, MachineIRBuilder &MIB) const {
+  assert(WideSize >= 8 && "WideSize is smaller than all possible registers?");
+  MachineRegisterInfo &MRI = *MIB.getMRI();
+  unsigned NarrowSize = MRI.getType(Reg).getSizeInBits();
+  assert(WideSize >= NarrowSize &&
+         "WideSize cannot be smaller than NarrowSize!");
+
+  // If the sizes match, just return the register.
+  //
+  // If NarrowSize is an s1, then we can select it to any size, so we'll treat
+  // it as a don't care.
+  if (NarrowSize == WideSize || NarrowSize == 1)
+    return Reg;
+
+  // Now check the register classes.
+  const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
+  const TargetRegisterClass *OrigRC = getMinClassForRegBank(*RB, NarrowSize);
+  const TargetRegisterClass *WideRC = getMinClassForRegBank(*RB, WideSize);
+  assert(OrigRC && "Could not determine narrow RC?");
+  assert(WideRC && "Could not determine wide RC?");
+
+  // If the sizes differ, but the register classes are the same, there is no
+  // need to insert a SUBREG_TO_REG.
+  //
+  // For example, an s8 that's supposed to be a GPR will be selected to either
+  // a GPR32 or a GPR64 register. Note that this assumes that the s8 will
+  // always end up on a GPR32.
+  if (OrigRC == WideRC)
+    return Reg;
+
+  // We have two different register classes. Insert a SUBREG_TO_REG.
+  unsigned SubReg = 0;
+  getSubRegForClass(OrigRC, TRI, SubReg);
+  assert(SubReg && "Couldn't determine subregister?");
+
+  // Build the SUBREG_TO_REG and return the new, widened register.
+  auto SubRegToReg =
+      MIB.buildInstr(AArch64::SUBREG_TO_REG, {WideRC}, {})
+          .addImm(0)
+          .addUse(Reg)
+          .addImm(SubReg);
+  constrainSelectedInstRegOperands(*SubRegToReg, TII, TRI, RBI);
+  return SubRegToReg.getReg(0);
+}
+
 /// Select an "extended register" operand. This operand folds in an extend
 /// followed by an optional left shift.
 InstructionSelector::ComplexRendererFns
--- a/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir
+++ b/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir
@ -78,9 +78,8 @@ body:             |
  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
  ; CHECK:   liveins: $h0
  ; CHECK:   [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, $h0, %subreg.hsub
-  ; CHECK:   %copy:gpr32all = COPY [[SUBREG_TO_REG]]
-  ; CHECK:   [[COPY:%[0-9]+]]:gpr32 = COPY %copy
-  ; CHECK:   TBNZW [[COPY]], 3, %bb.1
+  ; CHECK:   %copy:gpr32 = COPY [[SUBREG_TO_REG]]
+  ; CHECK:   TBNZW %copy, 3, %bb.1
  ; CHECK:   B %bb.0
  ; CHECK: bb.1:
  ; CHECK:   RET_ReallyLR
--- a/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir
+++ b/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir
@ -0,0 +1,193 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Test widening and narrowing on test bit operations using subregister copies
+# or SUBREG_TO_REG.
+--- |
+ @glob = external unnamed_addr global i1, align 4
+ define void @s1_no_copy() { ret void }
+ define void @s16_no_copy() { ret void }
+ define void @p0_no_copy() { ret void }
+ define void @widen_s32_to_s64() { ret void }
+ define void @widen_s16_to_s64() { ret void }
+ define void @narrow_s64_to_s32() { ret void }
+
+...
+---
+name:            s1_no_copy
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: s1_no_copy
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %narrow:gpr32 = IMPLICIT_DEF
+  ; CHECK:   TBNZW %narrow, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %narrow:gpr(s1) = G_IMPLICIT_DEF
+
+    ; There should be no copy here, because the s1 can be selected to a GPR32.
+    G_BRCOND %narrow(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            s16_no_copy
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: s16_no_copy
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %narrow:gpr32 = IMPLICIT_DEF
+  ; CHECK:   TBNZW %narrow, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %narrow:gpr(s16) = G_IMPLICIT_DEF
+    %trunc:gpr(s1) = G_TRUNC %narrow(s16)
+
+    ; Look through the G_TRUNC to get the G_IMPLICIT_DEF. We don't need a
+    ; SUBREG_TO_REG here, because the s16 will end up on a 32-bit register.
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            p0_no_copy
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: p0_no_copy
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %glob:gpr64common = MOVaddr target-flags(aarch64-page) @glob, target-flags(aarch64-pageoff, aarch64-nc) @glob
+  ; CHECK:   %load:gpr32 = LDRBBui %glob, 0 :: (dereferenceable load 1 from @glob, align 4)
+  ; CHECK:   TBNZW %load, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %glob:gpr(p0) = G_GLOBAL_VALUE @glob
+    %load:gpr(s8) = G_LOAD %glob(p0) :: (dereferenceable load 1 from @glob, align 4)
+    %trunc:gpr(s1) = G_TRUNC %load(s8)
+
+    ; Look through G_TRUNC to get the load. The load is into a s8, which will
+    ; be selected to a GPR32, so we don't need a copy.
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            widen_s32_to_s64
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: widen_s32_to_s64
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   liveins: $w0
+  ; CHECK:   %reg:gpr32all = COPY $w0
+  ; CHECK:   [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %reg, %subreg.sub_32
+  ; CHECK:   TBZX [[SUBREG_TO_REG]], 33, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    liveins: $w0
+    %reg:gpr(s32) = COPY $w0
+    %zext:gpr(s64) = G_ZEXT %reg(s32)
+    %bit:gpr(s64) = G_CONSTANT i64 8589934592
+    %zero:gpr(s64) = G_CONSTANT i64 0
+    %and:gpr(s64) = G_AND %zext, %bit
+    %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero
+
+    ; We should widen using a SUBREG_TO_REG here, because we need a TBZX to get
+    ; bit 33. The subregister should be sub_32.
+    %trunc:gpr(s1) = G_TRUNC %cmp(s32)
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            widen_s16_to_s64
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: widen_s16_to_s64
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %reg:gpr32 = IMPLICIT_DEF
+  ; CHECK:   [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %reg, %subreg.sub_32
+  ; CHECK:   TBZX [[SUBREG_TO_REG]], 33, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %reg:gpr(s16) = G_IMPLICIT_DEF
+    %zext:gpr(s64) = G_ZEXT %reg(s16)
+    %bit:gpr(s64) = G_CONSTANT i64 8589934592
+    %zero:gpr(s64) = G_CONSTANT i64 0
+    %and:gpr(s64) = G_AND %zext, %bit
+    %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero
+
+    ; We should widen using a SUBREG_TO_REG here, because we need a TBZX to get
+    ; bit 33. The subregister should be sub_32, because s16 will end up on a
+    ; GPR32.
+    %trunc:gpr(s1) = G_TRUNC %cmp(s32)
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            narrow_s64_to_s32
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: narrow_s64_to_s32
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   liveins: $x0
+  ; CHECK:   %wide:gpr64 = COPY $x0
+  ; CHECK:   %trunc:gpr32 = COPY %wide.sub_32
+  ; CHECK:   TBNZW %trunc, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    liveins: $x0
+    %wide:gpr(s64) = COPY $x0
+
+    ; We should narrow using a subregister copy here.
+    %trunc:gpr(s1) = G_TRUNC %wide(s64)
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR