
[AArch64] Extend AArch64SLSHardeningPass to harden BLR instructions.

To make sure that no barrier gets placed on the architectural execution
path, each
  BLR x<N>
instruction gets transformed to a
  BL __llvm_slsblr_thunk_x<N>
instruction, where __llvm_slsblr_thunk_x<N> is a thunk that contains:
__llvm_slsblr_thunk_x<N>:
  BR x<N>
  <speculation barrier>

Therefore, the BLR instruction is split in two: one BL and one BR.
With this transformation, no speculation barrier is inserted on the
architectural execution path; the barrier sits after the BR inside the
thunk, where it can only be reached speculatively.
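
For example, with N=8 (the case exercised by the MIR test added below),
  BLR x8
becomes
  BL __llvm_slsblr_thunk_x8
and the out-of-line thunk is (thunks always use the DSB SY; ISB barrier,
see SLSBLRThunkInserter below):
__llvm_slsblr_thunk_x8:
  BR x8
  DSB SY
  ISB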

The mitigation is off by default and can be enabled by the
harden-sls-blr subtarget feature.
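
For instance, the RUN lines in the tests added below enable it with
  llc -mattr=harden-sls-retbr,harden-sls-blr -verify-machineinstrs -mtriple=aarch64-none-linux-gnu
and the MIR test enables it per function via the
"target-features"="+harden-sls-blr" function attribute.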

As a linker is allowed to clobber X16 and X17 on function calls, the
above code transformation would not be correct when N=16 or N=17: a
linker-inserted veneer could overwrite the register before the thunk's
BR executes, as illustrated below. Therefore, when the mitigation is
enabled, generation of BLR x16 and BLR x17 is avoided.
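
A minimal sketch of the problematic (and therefore never generated)
sequence:
  BL __llvm_slsblr_thunk_x16    // a linker veneer on this call may clobber x16
__llvm_slsblr_thunk_x16:
  BR x16                        // would branch through the clobbered register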

As LLVM currently does not produce BLRA* indirect calls, this patch does
not aim to implement support for them.

Differential Revision:  https://reviews.llvm.org/D81402
Kristof Beyls 2020-06-11 09:23:15 +01:00
parent 16cd3a8a43
commit cd0b5e8976
16 changed files with 526 additions and 36 deletions

View File

@ -39,6 +39,7 @@ FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
FunctionPass *createAArch64StorePairSuppressPass();
FunctionPass *createAArch64ExpandPseudoPass();
FunctionPass *createAArch64SLSHardeningPass();
FunctionPass *createAArch64IndirectThunks();
FunctionPass *createAArch64SpeculationHardeningPass();
FunctionPass *createAArch64LoadStoreOptimizationPass();
FunctionPass *createAArch64SIMDInstrOptPass();

View File

@ -464,6 +464,9 @@ def FeatureUseEL#i#ForTP : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP",
def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr",
"HardenSlsRetBr", "true",
"Harden against straight line speculation across RET and BR instructions">;
def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr",
"HardenSlsBlr", "true",
"Harden against straight line speculation across BLR instructions">;
//===----------------------------------------------------------------------===//
// AArch64 Processors supported.

View File

@ -3270,7 +3270,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue the call.
MachineInstrBuilder MIB;
if (Subtarget->useSmallAddressing()) {
const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
const MCInstrDesc &II = TII.get(Addr.getReg() ? getBLRCallOpcode(*MF) : AArch64::BL);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
if (Symbol)
MIB.addSym(Symbol, 0);
@ -3303,7 +3303,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (!CallReg)
return false;
const MCInstrDesc &II = TII.get(AArch64::BLR);
const MCInstrDesc &II = TII.get(getBLRCallOpcode(*MF));
CallReg = constrainOperandRegClass(II, CallReg, 0);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg);
}

View File

@ -1126,7 +1126,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
.addReg(AArch64::X16, RegState::Kill)
.addReg(AArch64::X15, RegState::Implicit | RegState::Define)
.addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)

View File

@ -6092,7 +6092,9 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
}
else if (LastInstrOpcode == AArch64::BL ||
(LastInstrOpcode == AArch64::BLR && !HasBTI)) {
((LastInstrOpcode == AArch64::BLR ||
LastInstrOpcode == AArch64::BLRNoIP) &&
!HasBTI)) {
// FIXME: Do we need to check if the code after this uses the value of LR?
FrameID = MachineOutlinerThunk;
NumBytesToCreateFrame = 0;
@ -6409,7 +6411,8 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
// as a tail-call. Whitelist the call instructions we know about so we
// don't get unexpected results with call pseudo-instructions.
auto UnknownCallOutlineType = outliner::InstrType::Illegal;
if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
if (MI.getOpcode() == AArch64::BLR ||
MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
if (!Callee)
@ -6557,7 +6560,8 @@ void AArch64InstrInfo::buildOutlinedFrame(
if (Call->getOpcode() == AArch64::BL) {
TailOpcode = AArch64::TCRETURNdi;
} else {
assert(Call->getOpcode() == AArch64::BLR);
assert(Call->getOpcode() == AArch64::BLR ||
Call->getOpcode() == AArch64::BLRNoIP);
TailOpcode = AArch64::TCRETURNriALL;
}
MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
@ -6893,6 +6897,13 @@ uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
return get(Opc).TSFlags & AArch64::ElementSizeMask;
}
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
return AArch64::BLRNoIP;
else
return AArch64::BLR;
}
#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"

View File

@ -397,6 +397,9 @@ static inline bool isIndirectBranchOpcode(int Opc) {
return false;
}
/// Return opcode to be used for indirect calls.
unsigned getBLRCallOpcode(const MachineFunction &MF);
// struct TSFlags {
#define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits
#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 4-bit

View File

@ -589,6 +589,8 @@ let RecomputePerFunction = 1 in {
def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
// Toggles patterns which aren't beneficial in GlobalISel when we aren't
// optimizing. This allows us to selectively use patterns without impacting
// SelectionDAG's behaviour.
@ -2020,9 +2022,19 @@ def ERET : SpecialReturn<0b0100, "eret">;
def : InstAlias<"ret", (RET LR)>;
let isCall = 1, Defs = [LR], Uses = [SP] in {
def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
def BLR : BranchReg<0b0001, "blr", []>;
def BLRNoIP : Pseudo<(outs), (ins GPR64noip:$Rn), []>,
Sched<[WriteBrReg]>,
PseudoInstExpansion<(BLR GPR64:$Rn)>;
} // isCall
def : Pat<(AArch64call GPR64:$Rn),
(BLR GPR64:$Rn)>,
Requires<[NoSLSBLRMitigation]>;
def : Pat<(AArch64call GPR64noip:$Rn),
(BLRNoIP GPR64noip:$Rn)>,
Requires<[SLSBLRMitigation]>;
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
} // isBranch, isTerminator, isBarrier, isIndirectBranch

View File

@ -16,6 +16,7 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/IndirectThunks.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@ -57,9 +58,9 @@ public:
private:
bool hardenReturnsAndBRs(MachineBasicBlock &MBB) const;
void insertSpeculationBarrier(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
DebugLoc DL) const;
bool hardenBLRs(MachineBasicBlock &MBB) const;
MachineBasicBlock &ConvertBLRToBL(MachineBasicBlock &MBB,
MachineBasicBlock::iterator) const;
};
} // end anonymous namespace
@ -69,20 +70,26 @@ char AArch64SLSHardening::ID = 0;
INITIALIZE_PASS(AArch64SLSHardening, "aarch64-sls-hardening",
AARCH64_SLS_HARDENING_NAME, false, false)
void AArch64SLSHardening::insertSpeculationBarrier(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
DebugLoc DL) const {
static void insertSpeculationBarrier(const AArch64Subtarget *ST,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
DebugLoc DL,
bool AlwaysUseISBDSB = false) {
assert(MBBI != MBB.begin() &&
"Must not insert SpeculationBarrierEndBB as only instruction in MBB.");
assert(std::prev(MBBI)->isBarrier() &&
"SpeculationBarrierEndBB must only follow unconditional control flow "
"instructions.");
assert(std::prev(MBBI)->isTerminator() &&
"SpeculatoinBarrierEndBB must only follow terminators.");
if (ST->hasSB())
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SpeculationBarrierSBEndBB));
else
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SpeculationBarrierISBDSBEndBB));
"SpeculationBarrierEndBB must only follow terminators.");
const TargetInstrInfo *TII = ST->getInstrInfo();
unsigned BarrierOpc = ST->hasSB() && !AlwaysUseISBDSB
? AArch64::SpeculationBarrierSBEndBB
: AArch64::SpeculationBarrierISBDSBEndBB;
if (MBBI == MBB.end() ||
(MBBI->getOpcode() != AArch64::SpeculationBarrierSBEndBB &&
MBBI->getOpcode() != AArch64::SpeculationBarrierISBDSBEndBB))
BuildMI(MBB, MBBI, DL, TII->get(BarrierOpc));
}
bool AArch64SLSHardening::runOnMachineFunction(MachineFunction &MF) {
@ -91,12 +98,30 @@ bool AArch64SLSHardening::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
bool Modified = false;
for (auto &MBB : MF)
for (auto &MBB : MF) {
Modified |= hardenReturnsAndBRs(MBB);
Modified |= hardenBLRs(MBB);
}
return Modified;
}
static bool isBLR(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case AArch64::BLR:
case AArch64::BLRNoIP:
return true;
case AArch64::BLRAA:
case AArch64::BLRAB:
case AArch64::BLRAAZ:
case AArch64::BLRABZ:
llvm_unreachable("Currently, LLVM's code generator does not support "
"producing BLRA* instructions. Therefore, there's no "
"support in this pass for those instructions.");
}
return false;
}
bool AArch64SLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const {
if (!ST->hardenSlsRetBr())
return false;
@ -108,7 +133,244 @@ bool AArch64SLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const {
NextMBBI = std::next(MBBI);
if (MI.isReturn() || isIndirectBranchOpcode(MI.getOpcode())) {
assert(MI.isTerminator());
insertSpeculationBarrier(MBB, std::next(MBBI), MI.getDebugLoc());
insertSpeculationBarrier(ST, MBB, std::next(MBBI), MI.getDebugLoc());
Modified = true;
}
}
return Modified;
}
static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_";
static std::array<const char *, 29> SLSBLRThunkNames{
"__llvm_slsblr_thunk_x0", "__llvm_slsblr_thunk_x1",
"__llvm_slsblr_thunk_x2", "__llvm_slsblr_thunk_x3",
"__llvm_slsblr_thunk_x4", "__llvm_slsblr_thunk_x5",
"__llvm_slsblr_thunk_x6", "__llvm_slsblr_thunk_x7",
"__llvm_slsblr_thunk_x8", "__llvm_slsblr_thunk_x9",
"__llvm_slsblr_thunk_x10", "__llvm_slsblr_thunk_x11",
"__llvm_slsblr_thunk_x12", "__llvm_slsblr_thunk_x13",
"__llvm_slsblr_thunk_x14", "__llvm_slsblr_thunk_x15",
// X16 and X17 are deliberately missing, as the mitigation requires those
// registers not to be used in BLR. See the comment in ConvertBLRToBL for
// more details.
"__llvm_slsblr_thunk_x18", "__llvm_slsblr_thunk_x19",
"__llvm_slsblr_thunk_x20", "__llvm_slsblr_thunk_x21",
"__llvm_slsblr_thunk_x22", "__llvm_slsblr_thunk_x23",
"__llvm_slsblr_thunk_x24", "__llvm_slsblr_thunk_x25",
"__llvm_slsblr_thunk_x26", "__llvm_slsblr_thunk_x27",
"__llvm_slsblr_thunk_x28", "__llvm_slsblr_thunk_x29",
// X30 is deliberately missing, for similar reasons as X16 and X17 are
// missing.
"__llvm_slsblr_thunk_x31",
};
static std::array<unsigned, 29> SLSBLRThunkRegs{
AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4,
AArch64::X5, AArch64::X6, AArch64::X7, AArch64::X8, AArch64::X9,
AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14,
AArch64::X15, AArch64::X18, AArch64::X19, AArch64::X20, AArch64::X21,
AArch64::X22, AArch64::X23, AArch64::X24, AArch64::X25, AArch64::X26,
AArch64::X27, AArch64::X28, AArch64::FP, AArch64::XZR};
namespace {
struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> {
const char *getThunkPrefix() { return SLSBLRNamePrefix; }
bool mayUseThunk(const MachineFunction &MF) {
// FIXME: This could also check if there are any BLRs in the function
// to more accurately reflect if a thunk will be needed.
return MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr();
}
void insertThunks(MachineModuleInfo &MMI);
void populateThunk(MachineFunction &MF);
};
} // namespace
void SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI) {
// FIXME: It probably would be possible to filter which thunks to produce
// based on which registers are actually used in BLR instructions in this
// function. But would that be a worthwhile optimization?
for (StringRef Name : SLSBLRThunkNames)
createThunkFunction(MMI, Name);
}
void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) {
// FIXME: How to better communicate Register number, rather than through
// name and lookup table?
assert(MF.getName().startswith(getThunkPrefix()));
int Index = -1;
for (int i = 0; i < (int)SLSBLRThunkNames.size(); ++i)
if (MF.getName() == SLSBLRThunkNames[i]) {
Index = i;
break;
}
assert(Index != -1);
Register ThunkReg = SLSBLRThunkRegs[Index];
const TargetInstrInfo *TII =
MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
// Grab the entry MBB and erase any other blocks. O0 codegen appears to
// generate two bbs for the entry block.
MachineBasicBlock *Entry = &MF.front();
Entry->clear();
while (MF.size() > 1)
MF.erase(std::next(MF.begin()));
// These thunks need to consist of the following instructions:
// __llvm_slsblr_thunk_xN:
// BR xN
// barrierInsts
Entry->addLiveIn(ThunkReg);
BuildMI(Entry, DebugLoc(), TII->get(AArch64::BR)).addReg(ThunkReg);
// Make sure the thunks do not make use of the SB extension in case some
// function that calls the thunk has the SB extension disabled locally on
// that function, even though it is enabled for the module otherwise.
// Therefore, set AlwaysUseISBDSB to true.
insertSpeculationBarrier(&MF.getSubtarget<AArch64Subtarget>(), *Entry,
Entry->end(), DebugLoc(), true /*AlwaysUseISBDSB*/);
}
MachineBasicBlock &
AArch64SLSHardening::ConvertBLRToBL(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const {
// Transform a BLR to a BL as follows:
// Before:
// |-----------------------------|
// | ... |
// | instI |
// | BLR xN |
// | instJ |
// | ... |
// |-----------------------------|
//
// After:
// |-----------------------------|
// | ... |
// | instI |
// | BL __llvm_slsblr_thunk_xN |
// | instJ |
// | ... |
// |-----------------------------|
//
// __llvm_slsblr_thunk_xN:
// |-----------------------------|
// | BR xN |
// | barrierInsts |
// |-----------------------------|
//
// The __llvm_slsblr_thunk_xN thunks are created by the SLSBLRThunkInserter.
// This function merely needs to transform BLR xN into BL
// __llvm_slsblr_thunk_xN.
//
// Since linkers are allowed to clobber X16 and X17 on function calls, the
// above mitigation only works if the original BLR instruction was neither
// BLR X16 nor BLR X17. Earlier code generation must make sure that no
// BLR X16 or BLR X17 is produced when the mitigation is enabled.
MachineInstr &BLR = *MBBI;
assert(isBLR(BLR));
unsigned BLOpcode;
Register Reg;
bool RegIsKilled;
switch (BLR.getOpcode()) {
case AArch64::BLR:
case AArch64::BLRNoIP:
BLOpcode = AArch64::BL;
Reg = BLR.getOperand(0).getReg();
assert(Reg != AArch64::X16 && Reg != AArch64::X17 && Reg != AArch64::LR);
RegIsKilled = BLR.getOperand(0).isKill();
break;
case AArch64::BLRAA:
case AArch64::BLRAB:
case AArch64::BLRAAZ:
case AArch64::BLRABZ:
llvm_unreachable("BLRA instructions cannot yet be produced by LLVM, "
"therefore there is no need to support them for now.");
default:
llvm_unreachable("unhandled BLR");
}
DebugLoc DL = BLR.getDebugLoc();
// If we'd like to support also BLRAA and BLRAB instructions, we'd need
// a lot more different kind of thunks.
// For example, a
//
// BLRAA xN, xM
//
// instruction probably would need to be transformed to something like:
//
// BL __llvm_slsblraa_thunk_x<N>_x<M>
//
// __llvm_slsblraa_thunk_x<N>_x<M>:
// BRAA x<N>, x<M>
// barrierInsts
//
// Given that about 30 different values of N are possible and about 30
// different values of M are possible in the above, with the current way
// of producing indirect thunks, we'd be producing about 30 times 30, i.e.
// about 900 thunks (where most might not be actually called). This would
// multiply further by two to support both BLRAA and BLRAB variants of those
// instructions.
// If we'd want to support this, we'd probably need to look into a different
// way to produce thunk functions, based on which variants are actually
// needed, rather than producing all possible variants.
// So far, LLVM never produces BLRA* instructions, so let's leave this
// for the future, when LLVM may start producing them.
MachineFunction &MF = *MBBI->getMF();
MCContext &Context = MBB.getParent()->getContext();
MCSymbol *Sym = Context.getOrCreateSymbol("__llvm_slsblr_thunk_x" +
utostr(Reg - AArch64::X0));
MachineInstr *BL = BuildMI(MBB, MBBI, DL, TII->get(BLOpcode)).addSym(Sym);
// Now copy the implicit operands from BLR to BL and copy other necessary
// info.
// However, both BLR and BL instructions implicitly use SP and implicitly
// define LR. Blindly copying implicit operands would result in the SP and
// LR operands being present multiple times. While this may not be too much
// of an issue, let's avoid that for cleanliness by removing those implicit
// operands from the BL created above before we copy over all implicit
// operands from the BLR.
int ImpLROpIdx = -1;
int ImpSPOpIdx = -1;
for (unsigned OpIdx = BL->getNumExplicitOperands();
OpIdx < BL->getNumOperands(); OpIdx++) {
MachineOperand Op = BL->getOperand(OpIdx);
if (!Op.isReg())
continue;
if (Op.getReg() == AArch64::LR && Op.isDef())
ImpLROpIdx = OpIdx;
if (Op.getReg() == AArch64::SP && !Op.isDef())
ImpSPOpIdx = OpIdx;
}
assert(ImpLROpIdx != -1);
assert(ImpSPOpIdx != -1);
int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx);
int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx);
BL->RemoveOperand(FirstOpIdxToRemove);
BL->RemoveOperand(SecondOpIdxToRemove);
// Now copy over the implicit operands from the original BLR
BL->copyImplicitOps(MF, BLR);
MF.moveCallSiteInfo(&BLR, BL);
// Also add the register the BLR called through as an implicit use, since
// it is used by the called thunk.
BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/,
RegIsKilled /*isKill*/));
// Remove BLR instruction
MBB.erase(MBBI);
return MBB;
}
bool AArch64SLSHardening::hardenBLRs(MachineBasicBlock &MBB) const {
if (!ST->hardenSlsBlr())
return false;
bool Modified = false;
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MachineBasicBlock::iterator NextMBBI;
for (; MBBI != E; MBBI = NextMBBI) {
MachineInstr &MI = *MBBI;
NextMBBI = std::next(MBBI);
if (isBLR(MI)) {
ConvertBLRToBL(MBB, MBBI);
Modified = true;
}
}
@ -118,3 +380,60 @@ bool AArch64SLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const {
FunctionPass *llvm::createAArch64SLSHardeningPass() {
return new AArch64SLSHardening();
}
namespace {
class AArch64IndirectThunks : public MachineFunctionPass {
public:
static char ID;
AArch64IndirectThunks() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "AArch64 Indirect Thunks"; }
bool doInitialization(Module &M) override;
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
AU.addRequired<MachineModuleInfoWrapperPass>();
AU.addPreserved<MachineModuleInfoWrapperPass>();
}
private:
std::tuple<SLSBLRThunkInserter> TIs;
// FIXME: When LLVM moves to C++17, these can become folds
template <typename... ThunkInserterT>
static void initTIs(Module &M,
std::tuple<ThunkInserterT...> &ThunkInserters) {
(void)std::initializer_list<int>{
(std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...};
}
template <typename... ThunkInserterT>
static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
std::tuple<ThunkInserterT...> &ThunkInserters) {
bool Modified = false;
(void)std::initializer_list<int>{
Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...};
return Modified;
}
};
} // end anonymous namespace
char AArch64IndirectThunks::ID = 0;
FunctionPass *llvm::createAArch64IndirectThunks() {
return new AArch64IndirectThunks();
}
bool AArch64IndirectThunks::doInitialization(Module &M) {
initTIs(M, TIs);
return false;
}
bool AArch64IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << getPassName() << '\n');
auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
return runTIs(MMI, MF, TIs);
}

View File

@ -211,6 +211,7 @@ protected:
bool UseEL3ForTP = false;
bool AllowTaggedGlobals = false;
bool HardenSlsRetBr = false;
bool HardenSlsBlr = false;
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 3;
uint16_t CacheLineSize = 0;
@ -365,6 +366,7 @@ public:
}
bool hardenSlsRetBr() const { return HardenSlsRetBr; }
bool hardenSlsBlr() const { return HardenSlsBlr; }
bool useEL1ForTP() const { return UseEL1ForTP; }
bool useEL2ForTP() const { return UseEL2ForTP; }

View File

@ -636,6 +636,7 @@ void AArch64PassConfig::addPreSched2() {
// info.
addPass(createAArch64SpeculationHardeningPass());
addPass(createAArch64IndirectThunks());
addPass(createAArch64SLSHardeningPass());
if (TM->getOptLevel() != CodeGenOpt::None) {

View File

@ -773,17 +773,17 @@ bool AArch64CallLowering::isEligibleForTailCallOptimization(
return true;
}
static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect,
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
bool IsTailCall) {
if (!IsTailCall)
return IsIndirect ? AArch64::BLR : AArch64::BL;
return IsIndirect ? getBLRCallOpcode(CallerF) : AArch64::BL;
if (!IsIndirect)
return AArch64::TCRETURNdi;
// When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use
// x16 or x17.
if (CallerF.hasFnAttribute("branch-target-enforcement"))
if (CallerF.getFunction().hasFnAttribute("branch-target-enforcement"))
return AArch64::TCRETURNriBTI;
return AArch64::TCRETURNri;
@ -819,7 +819,7 @@ bool AArch64CallLowering::lowerTailCall(
if (!IsSibCall)
CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true);
unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.add(Info.Callee);
@ -979,7 +979,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false);
unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.add(Info.Callee);

View File

@ -2890,7 +2890,7 @@ bool AArch64InstructionSelector::selectTLSGlobalValue(
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
MIB.buildInstr(AArch64::BLR, {}, {Load})
MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
.addDef(AArch64::X0, RegState::Implicit)
.addRegMask(TRI.getTLSCallPreservedMask());

View File

@ -55,6 +55,7 @@
; CHECK-NEXT: Post-RA pseudo instruction expansion pass
; CHECK-NEXT: AArch64 pseudo instruction expansion pass
; CHECK-NEXT: AArch64 speculation hardening pass
; CHECK-NEXT: AArch64 Indirect Thunks
; CHECK-NEXT: AArch64 sls hardening pass
; CHECK-NEXT: Analyze Machine Code For Garbage Collection
; CHECK-NEXT: Insert fentry calls

View File

@ -178,6 +178,7 @@
; CHECK-NEXT: AArch64 pseudo instruction expansion pass
; CHECK-NEXT: AArch64 load / store optimization pass
; CHECK-NEXT: AArch64 speculation hardening pass
; CHECK-NEXT: AArch64 Indirect Thunks
; CHECK-NEXT: AArch64 sls hardening pass
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Natural Loop Construction

View File

@ -0,0 +1,58 @@
# RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu \
# RUN: -start-before aarch64-sls-hardening \
# RUN: -stop-after aarch64-sls-hardening -o - %s \
# RUN: | FileCheck %s --check-prefixes=CHECK
# Check that the BLR SLS hardening transforms a BLR into a BL with operands as
# expected.
--- |
$__llvm_slsblr_thunk_x8 = comdat any
@a = dso_local local_unnamed_addr global i32 (...)* null, align 8
@b = dso_local local_unnamed_addr global i32 0, align 4
define dso_local void @fn1() local_unnamed_addr "target-features"="+harden-sls-blr" {
entry:
%0 = load i32 ()*, i32 ()** bitcast (i32 (...)** @a to i32 ()**), align 8
%call = tail call i32 %0() nounwind
store i32 %call, i32* @b, align 4
ret void
}
; Function Attrs: naked nounwind
define linkonce_odr hidden void @__llvm_slsblr_thunk_x8() naked nounwind comdat {
entry:
ret void
}
...
---
name: fn1
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: fn1
bb.0.entry:
liveins: $lr
early-clobber $sp = frame-setup STRXpre killed $lr, $sp, -16 ; :: (store 8 into %stack.0)
frame-setup CFI_INSTRUCTION def_cfa_offset 16
frame-setup CFI_INSTRUCTION offset $w30, -16
renamable $x8 = ADRP target-flags(aarch64-page) @a
renamable $x8 = LDRXui killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @a :: (dereferenceable load 8 from `i32 ()** bitcast (i32 (...)** @a to i32 ()**)`)
BLRNoIP killed renamable $x8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
; CHECK: BL <mcsymbol __llvm_slsblr_thunk_x8>, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0, implicit killed $x8
renamable $x8 = ADRP target-flags(aarch64-page) @b
STRWui killed renamable $w0, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @b :: (store 4 into @b)
early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 ; :: (load 8 from %stack.0)
RET undef $lr
...
---
name: __llvm_slsblr_thunk_x8
tracksRegLiveness: true
body: |
bb.0.entry:
liveins: $x8
BR $x8
SpeculationBarrierISBDSBEndBB
...

View File

@ -1,5 +1,6 @@
; RUN: llc -mattr=harden-sls-retbr -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,ISBDSB
; RUN: llc -mattr=harden-sls-retbr -mattr=+sb -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,SB
; RUN: llc -mattr=harden-sls-retbr,harden-sls-blr -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,HARDEN,ISBDSB
; RUN: llc -mattr=harden-sls-retbr,harden-sls-blr -mattr=+sb -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,HARDEN,SB
; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,NOHARDEN
; Function Attrs: norecurse nounwind readnone
@ -24,33 +25,39 @@ if.else: ; preds = %entry
; ISBDSB-NEXT: dsb sy
; ISBDSB-NEXT: isb
; SB-NEXT: {{ sb$}}
; CHECK-NEXT: .Lfunc_end
}
@__const.indirect_branch.ptr = private unnamed_addr constant [2 x i8*] [i8* blockaddress(@indirect_branch, %return), i8* blockaddress(@indirect_branch, %l2)], align 8
; Function Attrs: norecurse nounwind readnone
define dso_local i32 @indirect_branch(i32 %a, i32 %b, i32 %i) {
; CHECK-LABEL: indirect_branch:
entry:
%idxprom = sext i32 %i to i64
%arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @__const.indirect_branch.ptr, i64 0, i64 %idxprom
%0 = load i8*, i8** %arrayidx, align 8
indirectbr i8* %0, [label %return, label %l2]
l2: ; preds = %entry
br label %return
return: ; preds = %entry, %l2
%retval.0 = phi i32 [ 1, %l2 ], [ 0, %entry ]
ret i32 %retval.0
; CHECK-LABEL: indirect_branch:
; CHECK: br x
; ISBDSB-NEXT: dsb sy
; ISBDSB-NEXT: isb
; SB-NEXT: {{ sb$}}
l2: ; preds = %entry
br label %return
; CHECK: {{ret$}}
; ISBDSB-NEXT: dsb sy
; ISBDSB-NEXT: isb
; SB-NEXT: {{ sb$}}
return: ; preds = %entry, %l2
%retval.0 = phi i32 [ 1, %l2 ], [ 0, %entry ]
ret i32 %retval.0
; CHECK: {{ret$}}
; ISBDSB-NEXT: dsb sy
; ISBDSB-NEXT: isb
; SB-NEXT: {{ sb$}}
; CHECK-NEXT: .Lfunc_end
}
; Check that RETAA and RETAB instructions are also protected as expected.
@ -61,6 +68,7 @@ entry:
; ISBDSB-NEXT: dsb sy
; ISBDSB-NEXT: isb
; SB-NEXT: {{ sb$}}
; CHECK-NEXT: .Lfunc_end
ret i32 %a
}
@ -71,6 +79,7 @@ entry:
; ISBDSB-NEXT: dsb sy
; ISBDSB-NEXT: isb
; SB-NEXT: {{ sb$}}
; CHECK-NEXT: .Lfunc_end
ret i32 %a
}
@ -102,3 +111,72 @@ d: ; preds = %asm.fallthrough, %entry
; SB-NEXT: {{ sb$}}
; CHECK-NEXT: .Lfunc_end
}
define dso_local i32 @indirect_call(
i32 (...)* nocapture %f1, i32 (...)* nocapture %f2) {
entry:
; CHECK-LABEL: indirect_call:
%callee.knr.cast = bitcast i32 (...)* %f1 to i32 ()*
%call = tail call i32 %callee.knr.cast()
; HARDEN: bl {{__llvm_slsblr_thunk_x[0-9]+$}}
%callee.knr.cast1 = bitcast i32 (...)* %f2 to i32 ()*
%call2 = tail call i32 %callee.knr.cast1()
; HARDEN: bl {{__llvm_slsblr_thunk_x[0-9]+$}}
%add = add nsw i32 %call2, %call
ret i32 %add
; CHECK: .Lfunc_end
}
; verify calling through a function pointer.
@a = dso_local local_unnamed_addr global i32 (...)* null, align 8
@b = dso_local local_unnamed_addr global i32 0, align 4
define dso_local void @indirect_call_global() local_unnamed_addr {
; CHECK-LABEL: indirect_call_global:
entry:
%0 = load i32 ()*, i32 ()** bitcast (i32 (...)** @a to i32 ()**), align 8
%call = tail call i32 %0() nounwind
; HARDEN: bl {{__llvm_slsblr_thunk_x[0-9]+$}}
store i32 %call, i32* @b, align 4
ret void
; CHECK: .Lfunc_end
}
; Verify that neither x16 nor x17 are used when the BLR mitigation is enabled,
; as a linker is allowed to clobber x16 or x17 on calls, which would break the
; correct execution of the code sequence produced by the mitigation.
; The below test carefully increases register pressure to persuade code
; generation to produce a BLR x16. Yes, that is a bit fragile.
define i64 @check_x16(i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** nocapture readonly %fp, i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** nocapture readonly %fp2) "target-features"="+neon,+reserve-x10,+reserve-x11,+reserve-x12,+reserve-x13,+reserve-x14,+reserve-x15,+reserve-x18,+reserve-x20,+reserve-x21,+reserve-x22,+reserve-x23,+reserve-x24,+reserve-x25,+reserve-x26,+reserve-x27,+reserve-x28,+reserve-x30,+reserve-x9" {
entry:
; CHECK-LABEL: check_x16:
%0 = load i64 (i8*, i64, i64, i64, i64, i64, i64, i64)*, i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** %fp, align 8
%1 = bitcast i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** %fp2 to i8**
%2 = load i8*, i8** %1, align 8
%call = call i64 %0(i8* %2, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0)
%3 = load i64 (i8*, i64, i64, i64, i64, i64, i64, i64)*, i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** %fp2, align 8
%4 = bitcast i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** %fp to i8**
%5 = load i8*, i8** %4, align 8;, !tbaa !2
%call1 = call i64 %3(i8* %5, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0)
; NOHARDEN: blr x16
; ISBDSB-NOT: bl __llvm_slsblr_thunk_x16
; SB-NOT: bl __llvm_slsblr_thunk_x16
; CHECK
%add = add nsw i64 %call1, %call
ret i64 %add
; CHECK: .Lfunc_end
}
; HARDEN-label: __llvm_slsblr_thunk_x0:
; HARDEN: br x0
; ISBDSB-NEXT: dsb sy
; ISBDSB-NEXT: isb
; SB-NEXT: dsb sy
; SB-NEXT: isb
; HARDEN-NEXT: .Lfunc_end
; HARDEN-label: __llvm_slsblr_thunk_x19:
; HARDEN: br x19
; ISBDSB-NEXT: dsb sy
; ISBDSB-NEXT: isb
; SB-NEXT: dsb sy
; SB-NEXT: isb
; HARDEN-NEXT: .Lfunc_end