
[AArch64] Enable implicit null check transformation

This change enables the generic implicit null check transformation for the AArch64 target. As background for those unfamiliar with our implicit null check support:

    An implicit null check uses a signal handler to catch a fault on a null pointer dereference and redirect control to a handler block. Specifically, it replaces an explicit conditional branch with such a redirect. This is only done for very cold branches under frontend control w/ appropriate metadata.
    FAULTING_OP is used to wrap the faulting instruction. It is modelled as a conditional branch to reflect the fact that it can transfer control in the CFG.
    FAULTING_OP does not need to be an analyzable branch to achieve its purpose. (Or at least, that's the x86 model. I find this slightly questionable.)
    When lowering to MC, we convert the FAULTING_OP back into the actual instruction, record the labels, and lower the original instruction. (A rough sketch of the end-to-end shape follows below.)

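To make the background concrete, here is a rough sketch of the end-to-end shape on AArch64. It is purely illustrative; the labels, register, and block names are made up for this description rather than taken from the patch:

    // Before the pass: an explicit check guarding a very cold null path.
        cbz   x0, .Lis_null        // explicit null check
        ldr   w0, [x0]             // the guarded load

    // After ImplicitNullChecks plus MC lowering: the explicit branch is gone,
    // the load is emitted behind a temporary label, and the pair
    // (.Ltmp0, .Lis_null) is recorded in the fault map so that a runtime
    // SIGSEGV handler can redirect a faulting load to .Lis_null.
    .Ltmp0:
        ldr   w0, [x0]             // on-fault: .Lis_null
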
As can be seen in the test changes, the AArch64 backend does not currently eliminate the unconditional branch to the fallthrough block. I've tried two approaches, neither of which worked. I plan to return to this in a separate change set once I've wrapped my head around the interactions a bit better. (X86 handles this via AllowModify on analyzeBranch, but adding the obvious code causes BranchFolding to crash. I haven't yet figured out whether it's a latent bug in BranchFolding or something I'm doing wrong.)

Differential Revision: https://reviews.llvm.org/D87851
Philip Reames 2020-09-17 15:39:50 -07:00
parent 9ae2247c05
commit f9eea5b8c6
6 changed files with 204 additions and 74 deletions


@@ -507,7 +507,14 @@ bool BranchRelaxation::relaxBranchInstructions() {
Next = std::next(J);
MachineInstr &MI = *J;
if (MI.isConditionalBranch()) {
if (!MI.isConditionalBranch())
continue;
if (MI.getOpcode() == TargetOpcode::FAULTING_OP)
// FAULTING_OP's destination is not encoded in the instruction stream
// and thus never needs to be relaxed.
continue;
MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI);
if (!isBlockInRange(MI, *DestBB)) {
if (Next != MBB.end() && Next->isConditionalBranch()) {
@@ -528,7 +535,6 @@ bool BranchRelaxation::relaxBranchInstructions() {
}
}
}
}
return Changed;
}


@@ -373,6 +373,10 @@ ImplicitNullChecks::isSuitableMemoryOp(const MachineInstr &MI,
bool OffsetIsScalable;
const MachineOperand *BaseOp;
// Implementation restriction for faulting_op insertion
// TODO: This could be relaxed if we find a test case which warrants it.
if (MI.getDesc().getNumDefs() > 1)
return SR_Unsuitable;
// FIXME: This handles only simple addressing mode.
if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
@@ -502,9 +506,9 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
MBP.Predicate == MachineBranchPredicate::PRED_EQ)))
return false;
// If we cannot erase the test instruction itself, then making the null check
// implicit does not buy us much.
if (!MBP.SingleUseCondition)
// If there is a separate condition generation instruction, we choose not to
// transform unless we can remove both condition and consuming branch.
if (MBP.ConditionDef && !MBP.SingleUseCondition)
return false;
MachineBasicBlock *NotNullSucc, *NullSucc;
@@ -522,6 +526,9 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
if (NotNullSucc->pred_size() != 1)
return false;
const Register PointerReg = MBP.LHS.getReg();
if (MBP.ConditionDef) {
// To prevent the invalid transformation of the following code:
//
// mov %rax, %rcx
@@ -540,14 +547,13 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
//
// we must ensure that there are no instructions between the 'test' and
// conditional jump that modify %rax.
const Register PointerReg = MBP.LHS.getReg();
assert(MBP.ConditionDef->getParent() == &MBB && "Should be in basic block");
assert(MBP.ConditionDef->getParent() == &MBB &&
"Should be in basic block");
for (auto I = MBB.rbegin(); MBP.ConditionDef != &*I; ++I)
if (I->modifiesRegister(PointerReg, TRI))
return false;
}
// Starting with a code fragment like:
//
// test %rax, %rax
@@ -726,9 +732,11 @@ void ImplicitNullChecks::rewriteNullChecks(
}
NC.getMemOperation()->eraseFromParent();
NC.getCheckOperation()->eraseFromParent();
if (auto *CheckOp = NC.getCheckOperation())
CheckOp->eraseFromParent();
// Insert an *unconditional* branch to not-null successor.
// Insert an *unconditional* branch to not-null successor - we expect
// block placement to remove fallthroughs later.
TII->insertBranch(*NC.getCheckBlock(), NC.getNotNullSucc(), nullptr,
/*Cond=*/None, DL);

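For orientation, a hedged sketch of what the check block looks like after rewriteNullChecks on AArch64 (block names are illustrative). The trailing unconditional branch is the one the commit message says the AArch64 backend does not yet eliminate:

    // %bb.check:                     // previously ended in: cbz x0, %bb.is_null
    //   <FAULTING_OP-wrapped load>   // handler block: %bb.is_null
    //   b     %bb.not_null           // inserted unconditional branch; block
    //                                // placement is expected to turn it into a
    //                                // fallthrough, but on AArch64 it currently
    //                                // survives (see the 'b .LBB*_1' in the tests)
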

@@ -32,6 +32,7 @@
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/FaultMaps.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -69,12 +70,13 @@ namespace {
class AArch64AsmPrinter : public AsmPrinter {
AArch64MCInstLower MCInstLowering;
StackMaps SM;
FaultMaps FM;
const AArch64Subtarget *STI;
public:
AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this),
SM(*this) {}
SM(*this), FM(*this) {}
StringRef getPassName() const override { return "AArch64 Assembly Printer"; }
@@ -97,6 +99,7 @@ public:
const MachineInstr &MI);
void LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI);
void LowerFAULTING_OP(const MachineInstr &MI);
void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
@@ -524,7 +527,11 @@ void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) {
// generates code that does this, it is always safe to set.
OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
// Emit stack and fault map information.
emitStackMaps(SM);
FM.serializeToFaultMapSection();
}
void AArch64AsmPrinter::EmitLOHs() {
@@ -970,6 +977,42 @@ void AArch64AsmPrinter::LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM,
SM.recordStatepoint(*MILabel, MI);
}
void AArch64AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI) {
// FAULTING_OP <def>, <faulting type>, <MBB handler>,
// <opcode>, <operands>
Register DefRegister = FaultingMI.getOperand(0).getReg();
FaultMaps::FaultKind FK =
static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
unsigned Opcode = FaultingMI.getOperand(3).getImm();
unsigned OperandsBeginIdx = 4;
auto &Ctx = OutStreamer->getContext();
MCSymbol *FaultingLabel = Ctx.createTempSymbol();
OutStreamer->emitLabel(FaultingLabel);
assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel);
MCInst MI;
MI.setOpcode(Opcode);
if (DefRegister != (Register)0)
MI.addOperand(MCOperand::createReg(DefRegister));
for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx,
E = FaultingMI.operands_end();
I != E; ++I) {
MCOperand Dest;
lowerOperand(*I, Dest);
MI.addOperand(Dest);
}
OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
OutStreamer->emitInstruction(MI, getSubtargetInfo());
}
void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
@@ -1254,6 +1297,9 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
case TargetOpcode::STATEPOINT:
return LowerSTATEPOINT(*OutStreamer, SM, *MI);
case TargetOpcode::FAULTING_OP:
return LowerFAULTING_OP(*MI);
case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
LowerPATCHABLE_FUNCTION_ENTER(*MI);
return;

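Putting the AsmPrinter pieces together: each FAULTING_OP lowered above contributes one fault map record, roughly of the following shape (field names and labels here are descriptive, not the exact on-disk encoding):

    //   fault kind:   FaultingLoad   // operand 1 of the FAULTING_OP
    //   faulting PC:  .Ltmp0         // temp label emitted just before the load
    //   handler PC:   .LBB0_2        // label of the null-handler block
    // FM.serializeToFaultMapSection() in emitEndOfAsmFile then writes the
    // accumulated records into the fault map section, next to the stack maps.
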

@@ -328,6 +328,56 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
return true;
}
bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
MachineBranchPredicate &MBP,
bool AllowModify) const {
// For the moment, handle only a block which ends with a cb(n)zx followed by
// a fallthrough. Why this? Because it is a common form.
// TODO: Should we handle b.cc?
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return true;
// Skip over SpeculationBarrierEndBB terminators
if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
--I;
}
if (!isUnpredicatedTerminator(*I))
return true;
// Get the last instruction in the block.
MachineInstr *LastInst = &*I;
unsigned LastOpc = LastInst->getOpcode();
if (!isCondBranchOpcode(LastOpc))
return true;
switch (LastOpc) {
default:
return true;
case AArch64::CBZW:
case AArch64::CBZX:
case AArch64::CBNZW:
case AArch64::CBNZX:
break;
};
MBP.TrueDest = LastInst->getOperand(1).getMBB();
assert(MBP.TrueDest && "expected!");
MBP.FalseDest = MBB.getNextNode();
MBP.ConditionDef = nullptr;
MBP.SingleUseCondition = false;
MBP.LHS = LastInst->getOperand(0);
MBP.RHS = MachineOperand::CreateImm(0);
MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
: MachineBranchPredicate::PRED_EQ;
return false;
}
bool AArch64InstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
if (Cond[0].getImm() != -1) {

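A sketch of the only block shape the new analyzeBranchPredicate above accepts, and how it fills in MachineBranchPredicate (register and block names are illustrative):

    //     cbz   x0, %bb.is_null      // last terminator: CBZW/CBZX/CBNZW/CBNZX
    //     // falls through to %bb.not_null
    //
    // is reported as:
    //   MBP.LHS          = x0            // register tested by the cb(n)z
    //   MBP.RHS          = 0             // implicit comparison against zero
    //   MBP.Predicate    = PRED_EQ       // branch taken when x0 == 0
    //   MBP.TrueDest     = %bb.is_null   // explicit branch target
    //   MBP.FalseDest    = %bb.not_null  // textual fallthrough block
    //   MBP.ConditionDef = nullptr       // no separate compare instruction
    // ImplicitNullChecks::analyzeBlockForNullChecks (see the hunks above) is
    // the intended consumer of these fields.
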

@@ -188,6 +188,9 @@ public:
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify = false) const override;
bool analyzeBranchPredicate(MachineBasicBlock &MBB,
MachineBranchPredicate &MBP,
bool AllowModify) const override;
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,


@@ -5,15 +5,14 @@
; file with the same name in the X86 tree, but adjusted to remove patterns
; related to memory folding of arithmetic (since aarch64 doesn't), and add
; a couple of aarch64 specific tests.
; NOTE: Currently negative tests as these are being precommitted before
; the changes to enable.
define i32 @imp_null_check_load_fallthrough(i32* %x) {
; CHECK-LABEL: imp_null_check_load_fallthrough:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cbz x0, .LBB0_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: ldr w0, [x0]
; CHECK-NEXT: .Ltmp0:
; CHECK-NEXT: ldr w0, [x0] // on-fault: .LBB0_2
; CHECK-NEXT: b .LBB0_1
; CHECK-NEXT: .LBB0_1: // %not_null
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_2: // %is_null
; CHECK-NEXT: mov w0, #42
@@ -34,9 +33,10 @@ is_null:
define i32 @imp_null_check_load_reorder(i32* %x) {
; CHECK-LABEL: imp_null_check_load_reorder:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cbz x0, .LBB1_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: ldr w0, [x0]
; CHECK-NEXT: .Ltmp1:
; CHECK-NEXT: ldr w0, [x0] // on-fault: .LBB1_2
; CHECK-NEXT: b .LBB1_1
; CHECK-NEXT: .LBB1_1: // %not_null
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_2: // %is_null
; CHECK-NEXT: mov w0, #42
@@ -56,9 +56,10 @@ define i32 @imp_null_check_load_reorder(i32* %x) {
define i32 @imp_null_check_unordered_load(i32* %x) {
; CHECK-LABEL: imp_null_check_unordered_load:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cbz x0, .LBB2_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: ldr w0, [x0]
; CHECK-NEXT: .Ltmp2:
; CHECK-NEXT: ldr w0, [x0] // on-fault: .LBB2_2
; CHECK-NEXT: b .LBB2_1
; CHECK-NEXT: .LBB2_1: // %not_null
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_2: // %is_null
; CHECK-NEXT: mov w0, #42
@@ -76,6 +77,8 @@ define i32 @imp_null_check_unordered_load(i32* %x) {
}
; TODO: Can be converted into implicit check.
;; Probably could be implicit, but we're conservative for now
define i32 @imp_null_check_seq_cst_load(i32* %x) {
; CHECK-LABEL: imp_null_check_seq_cst_load:
; CHECK: // %bb.0: // %entry
@@ -125,9 +128,10 @@ define i32 @imp_null_check_volatile_load(i32* %x) {
define i8 @imp_null_check_load_i8(i8* %x) {
; CHECK-LABEL: imp_null_check_load_i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cbz x0, .LBB5_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: ldrb w0, [x0]
; CHECK-NEXT: .Ltmp3:
; CHECK-NEXT: ldrb w0, [x0] // on-fault: .LBB5_2
; CHECK-NEXT: b .LBB5_1
; CHECK-NEXT: .LBB5_1: // %not_null
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB5_2: // %is_null
; CHECK-NEXT: mov w0, #42
@@ -176,9 +180,10 @@ define i256 @imp_null_check_load_i256(i256* %x) {
define i32 @imp_null_check_gep_load(i32* %x) {
; CHECK-LABEL: imp_null_check_gep_load:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cbz x0, .LBB7_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: ldr w0, [x0, #128]
; CHECK-NEXT: .Ltmp4:
; CHECK-NEXT: ldr w0, [x0, #128] // on-fault: .LBB7_2
; CHECK-NEXT: b .LBB7_1
; CHECK-NEXT: .LBB7_1: // %not_null
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB7_2: // %is_null
; CHECK-NEXT: mov w0, #42
@@ -199,9 +204,10 @@ define i32 @imp_null_check_gep_load(i32* %x) {
define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
; CHECK-LABEL: imp_null_check_add_result:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cbz x0, .LBB8_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: ldr w8, [x0]
; CHECK-NEXT: .Ltmp5:
; CHECK-NEXT: ldr w8, [x0] // on-fault: .LBB8_2
; CHECK-NEXT: b .LBB8_1
; CHECK-NEXT: .LBB8_1: // %not_null
; CHECK-NEXT: add w0, w8, w1
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB8_2: // %is_null
@@ -225,9 +231,10 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
define i32 @imp_null_check_hoist_over_udiv(i32* %x, i32 %a, i32 %b) {
; CHECK-LABEL: imp_null_check_hoist_over_udiv:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cbz x0, .LBB9_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: ldr w8, [x0]
; CHECK-NEXT: .Ltmp6:
; CHECK-NEXT: ldr w8, [x0] // on-fault: .LBB9_2
; CHECK-NEXT: b .LBB9_1
; CHECK-NEXT: .LBB9_1: // %not_null
; CHECK-NEXT: udiv w9, w1, w2
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
@@ -249,6 +256,8 @@ define i32 @imp_null_check_hoist_over_udiv(i32* %x, i32 %a, i32 %b) {
}
; TODO: We should be able to hoist this - we can on x86, why isn't this
; working for aarch64? Aliasing?
define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z) {
; CHECK-LABEL: imp_null_check_hoist_over_unrelated_load:
; CHECK: // %bb.0: // %entry
@@ -278,9 +287,10 @@ define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z)
define i32 @imp_null_check_gep_load_with_use_dep(i32* %x, i32 %a) {
; CHECK-LABEL: imp_null_check_gep_load_with_use_dep:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cbz x0, .LBB11_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: ldr w8, [x0]
; CHECK-NEXT: .Ltmp7:
; CHECK-NEXT: ldr w8, [x0] // on-fault: .LBB11_2
; CHECK-NEXT: b .LBB11_1
; CHECK-NEXT: .LBB11_1: // %not_null
; CHECK-NEXT: add w9, w0, w1
; CHECK-NEXT: add w8, w9, w8
; CHECK-NEXT: add w0, w8, #4 // =4
@@ -304,6 +314,8 @@ define i32 @imp_null_check_gep_load_with_use_dep(i32* %x, i32 %a) {
ret i32 %z
}
;; TODO: We could handle this case as we can lift the fence into the
;; previous block before the conditional without changing behavior.
define i32 @imp_null_check_load_fence1(i32* %x) {
; CHECK-LABEL: imp_null_check_load_fence1:
; CHECK: // %bb.0: // %entry
@@ -328,6 +340,8 @@ not_null:
ret i32 %t
}
;; TODO: We could handle this case as we can lift the fence into the
;; previous block before the conditional without changing behavior.
define i32 @imp_null_check_load_fence2(i32* %x) {
; CHECK-LABEL: imp_null_check_load_fence2:
; CHECK: // %bb.0: // %entry
@@ -352,6 +366,7 @@ not_null:
ret i32 %t
}
; TODO: We can fold to implicit null here, not sure why this isn't working
define void @imp_null_check_store(i32* %x) {
; CHECK-LABEL: imp_null_check_store:
; CHECK: // %bb.0: // %entry
@@ -374,6 +389,7 @@ define void @imp_null_check_store(i32* %x) {
ret void
}
;; TODO: can be implicit
define void @imp_null_check_unordered_store(i32* %x) {
; CHECK-LABEL: imp_null_check_unordered_store:
; CHECK: // %bb.0: // %entry
@@ -399,9 +415,10 @@ define void @imp_null_check_unordered_store(i32* %x) {
define i32 @imp_null_check_neg_gep_load(i32* %x) {
; CHECK-LABEL: imp_null_check_neg_gep_load:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cbz x0, .LBB16_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: ldur w0, [x0, #-128]
; CHECK-NEXT: .Ltmp8:
; CHECK-NEXT: ldur w0, [x0, #-128] // on-fault: .LBB16_2
; CHECK-NEXT: b .LBB16_1
; CHECK-NEXT: .LBB16_1: // %not_null
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB16_2: // %is_null
; CHECK-NEXT: mov w0, #42