
MemTag: unchecked load/store optimization.

Summary:
MTE allows a memory access to bypass the tag check iff the address argument
is [SP, #imm]. This change takes advantage of that to demote uses of tagged
addresses to regular FrameIndex operands, reducing register pressure in
large functions.

The MO_TAGGED target flag is used to signal that a FrameIndex operand
refers to memory that might be tagged and needs to be handled with care.
Such an operand must be lowered to [SP, #imm] directly, without a scratch
register.
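
Roughly, this turns a checked access into an SP-relative one. A sketch, with
illustrative registers and offsets (not taken from actual compiler output):

  ; before: the tagged slot address lives in a register, so the load is tag-checked
  ldr  w0, [x8]          ; x8 = tagged address of the stack slot
  ; after: the slot is addressed directly off SP, which MTE does not tag-check
  ldr  w0, [sp, #16]     ; no scratch register needed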

The transformation pass attempts to predict when the offset will be out of
range and, in that case, disables the optimization.
AArch64RegisterInfo::eliminateFrameIndex has an escape hatch for when this
prediction turns out to be wrong, but the fallback is quite inefficient and
should be avoided.
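
As a sketch of that fallback (register names and the offset are illustrative),
the tagged pointer is recomputed with LDG and the access goes through it as a
normal checked load:

  add  x8, sp, #4096     ; materialize the slot address in a scratch register
  ldg  x8, [x8]          ; reload the allocation tag into the pointer
  ldrb w0, [x8]          ; regular (checked) access through the tagged pointer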

Reviewers: pcc, vitalybuka, ostannard

Subscribers: mgorny, javed.absar, kristof.beyls, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D66457

llvm-svn: 370490
Evgeniy Stepanov, 2019-08-30 17:23:02 +00:00
parent dfd3a1d37d
commit 74f96070c3
8 changed files with 389 additions and 1 deletion


@@ -57,6 +57,7 @@ createAArch64InstructionSelector(const AArch64TargetMachine &,
     AArch64Subtarget &, AArch64RegisterBankInfo &);
 FunctionPass *createAArch64PreLegalizeCombiner();
 FunctionPass *createAArch64StackTaggingPass(bool MergeInit);
+FunctionPass *createAArch64StackTaggingPreRAPass();

 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
@@ -80,6 +81,7 @@ void initializeFalkorHWPFFixPass(PassRegistry&);
 void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
 void initializeAArch64StackTaggingPass(PassRegistry&);
+void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
 } // end namespace llvm

 #endif


@@ -447,11 +447,14 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MachineInstr &MI = *II;
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
   const AArch64InstrInfo *TII =
       MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
   const AArch64FrameLowering *TFI = getFrameLowering(MF);
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  bool Tagged =
+      MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED;
   unsigned FrameReg;

   // Special handling of dbg_value, stackmap and patchpoint instructions.
@@ -477,12 +480,36 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   StackOffset Offset;
   if (MI.getOpcode() == AArch64::TAGPstack) {
     // TAGPstack must use the virtual frame register in its 3rd operand.
-    const MachineFrameInfo &MFI = MF.getFrameInfo();
     const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
     FrameReg = MI.getOperand(3).getReg();
     Offset = {MFI.getObjectOffset(FrameIndex) +
                   AFI->getTaggedBasePointerOffset(),
               MVT::i8};
+  } else if (Tagged) {
+    StackOffset SPOffset = {
+        MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), MVT::i8};
+    if (MFI.hasVarSizedObjects() ||
+        isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) !=
+            (AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) {
+      // Can't update to SP + offset in place. Precalculate the tagged pointer
+      // in a scratch register.
+      Offset = TFI->resolveFrameIndexReference(
+          MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
+      Register ScratchReg =
+          MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+      emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset,
+                      TII);
+      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg)
+          .addReg(ScratchReg)
+          .addReg(ScratchReg)
+          .addImm(0);
+      MI.getOperand(FIOperandNum)
+          .ChangeToRegister(ScratchReg, false, false, true);
+      return;
+    }
+    FrameReg = AArch64::SP;
+    Offset = {MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(),
+              MVT::i8};
   } else {
     Offset = TFI->resolveFrameIndexReference(
         MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);


@@ -0,0 +1,209 @@
//===-- AArch64StackTaggingPreRA.cpp --- Stack Tagging for AArch64 -----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64InstrInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-stack-tagging-pre-ra"

enum UncheckedLdStMode { UncheckedNever, UncheckedSafe, UncheckedAlways };

cl::opt<UncheckedLdStMode> ClUncheckedLdSt(
    "stack-tagging-unchecked-ld-st", cl::Hidden,
    cl::init(UncheckedSafe),
    cl::desc(
        "Unconditionally apply unchecked-ld-st optimization (even for large "
        "stack frames, or in the presence of variable sized allocas)."),
    cl::values(
        clEnumValN(UncheckedNever, "never", "never apply unchecked-ld-st"),
        clEnumValN(
            UncheckedSafe, "safe",
            "apply unchecked-ld-st when the target is definitely within range"),
        clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));

namespace {

class AArch64StackTaggingPreRA : public MachineFunctionPass {
  MachineFunction *MF;
  AArch64FunctionInfo *AFI;
  MachineFrameInfo *MFI;
  MachineRegisterInfo *MRI;
  const AArch64RegisterInfo *TRI;
  const AArch64InstrInfo *TII;

  SmallVector<MachineInstr*, 16> ReTags;

public:
  static char ID;
  AArch64StackTaggingPreRA() : MachineFunctionPass(ID) {
    initializeAArch64StackTaggingPreRAPass(*PassRegistry::getPassRegistry());
  }

  bool mayUseUncheckedLoadStore();
  void uncheckUsesOf(unsigned TaggedReg, int FI);
  void uncheckLoadsAndStores();

  bool runOnMachineFunction(MachineFunction &Func) override;

  StringRef getPassName() const override {
    return "AArch64 Stack Tagging PreRA";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
} // end anonymous namespace

char AArch64StackTaggingPreRA::ID = 0;

INITIALIZE_PASS_BEGIN(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra",
                      "AArch64 Stack Tagging PreRA Pass", false, false)
INITIALIZE_PASS_END(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra",
                    "AArch64 Stack Tagging PreRA Pass", false, false)

FunctionPass *llvm::createAArch64StackTaggingPreRAPass() {
  return new AArch64StackTaggingPreRA();
}

static bool isUncheckedLoadOrStoreOpcode(unsigned Opcode) {
  switch (Opcode) {
  case AArch64::LDRWui:
  case AArch64::LDRSHWui:
  case AArch64::LDRXui:
  case AArch64::LDRBui:
  case AArch64::LDRBBui:
  case AArch64::LDRHui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::STRWui:
  case AArch64::STRXui:
  case AArch64::STRBui:
  case AArch64::STRBBui:
  case AArch64::STRHui:
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
    return true;
  default:
    return false;
  }
}

bool AArch64StackTaggingPreRA::mayUseUncheckedLoadStore() {
  if (ClUncheckedLdSt == UncheckedNever)
    return false;
  else if (ClUncheckedLdSt == UncheckedAlways)
    return true;

  // This estimate can be improved if we had harder guarantees about stack frame
  // layout. With LocalStackAllocation we can estimate SP offset to any
  // preallocated slot. AArch64FrameLowering::orderFrameObjects could put tagged
  // objects ahead of non-tagged ones, but that's not always desirable.
  //
  // Underestimating SP offset here may require the use of LDG to materialize
  // the tagged address of the stack slot, along with a scratch register
  // allocation (post-regalloc!).
  //
  // For now we do the safe thing here and require that the entire stack frame
  // is within range of the shortest of the unchecked instructions.
  unsigned FrameSize = 0;
  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i)
    FrameSize += MFI->getObjectSize(i);
  bool EntireFrameReachableFromSP = FrameSize < 0xf00;
  return !MFI->hasVarSizedObjects() && EntireFrameReachableFromSP;
}

void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) {
  for (auto UI = MRI->use_instr_begin(TaggedReg), E = MRI->use_instr_end();
       UI != E;) {
    MachineInstr *UseI = &*(UI++);
    if (isUncheckedLoadOrStoreOpcode(UseI->getOpcode())) {
      // FI operand is always the one before the immediate offset.
      unsigned OpIdx = TII->getLoadStoreImmIdx(UseI->getOpcode()) - 1;
      if (UseI->getOperand(OpIdx).isReg() &&
          UseI->getOperand(OpIdx).getReg() == TaggedReg) {
        UseI->getOperand(OpIdx).ChangeToFrameIndex(FI);
        UseI->getOperand(OpIdx).setTargetFlags(AArch64II::MO_TAGGED);
      }
    } else if (UseI->isCopy() &&
               Register::isVirtualRegister(UseI->getOperand(0).getReg())) {
      uncheckUsesOf(UseI->getOperand(0).getReg(), FI);
    }
  }
}

void AArch64StackTaggingPreRA::uncheckLoadsAndStores() {
  for (auto *I : ReTags) {
    unsigned TaggedReg = I->getOperand(0).getReg();
    int FI = I->getOperand(1).getIndex();
    uncheckUsesOf(TaggedReg, FI);
  }
}

bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
  MF = &Func;
  MRI = &MF->getRegInfo();
  AFI = MF->getInfo<AArch64FunctionInfo>();
  TII = static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());
  MFI = &MF->getFrameInfo();
  ReTags.clear();

  assert(MRI->isSSA());

  LLVM_DEBUG(dbgs() << "********** AArch64 Stack Tagging PreRA **********\n"
                    << "********** Function: " << MF->getName() << '\n');

  SmallSetVector<int, 8> TaggedSlots;
  for (auto &BB : *MF) {
    for (auto &I : BB) {
      if (I.getOpcode() == AArch64::TAGPstack) {
        ReTags.push_back(&I);
        int FI = I.getOperand(1).getIndex();
        TaggedSlots.insert(FI);
        // There should be no offsets in TAGP yet.
        assert(I.getOperand(2).getImm() == 0);
      }
    }
  }

  if (ReTags.empty())
    return false;

  if (mayUseUncheckedLoadStore())
    uncheckLoadsAndStores();

  return true;
}


@@ -180,6 +180,7 @@ extern "C" void LLVMInitializeAArch64Target() {
   initializeLDTLSCleanupPass(*PR);
   initializeAArch64SpeculationHardeningPass(*PR);
   initializeAArch64StackTaggingPass(*PR);
+  initializeAArch64StackTaggingPreRAPass(*PR);
 }

 //===----------------------------------------------------------------------===//
@@ -541,6 +542,8 @@ bool AArch64PassConfig::addILPOpts() {
   if (EnableStPairSuppress)
     addPass(createAArch64StorePairSuppressPass());
   addPass(createAArch64SIMDInstrOptPass());
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createAArch64StackTaggingPreRAPass());
   return true;
 }


@@ -56,6 +56,7 @@ add_llvm_target(AArch64CodeGen
   AArch64SelectionDAGInfo.cpp
   AArch64SpeculationHardening.cpp
   AArch64StackTagging.cpp
+  AArch64StackTaggingPreRA.cpp
   AArch64StorePairSuppress.cpp
   AArch64Subtarget.cpp
   AArch64TargetMachine.cpp


@@ -635,6 +635,10 @@ namespace AArch64II {
     /// MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag
     /// in bits 56-63.
+    /// On a FrameIndex operand, indicates that the underlying memory is tagged
+    /// with an unknown tag value (MTE); this needs to be lowered either to an
+    /// SP-relative load or store instruction (which do not check tags), or to
+    /// an LDG instruction to obtain the tag value.
     MO_TAGGED = 0x400,
   };
 } // end namespace AArch64II


@@ -97,6 +97,7 @@
 ; CHECK-NEXT: Early If-Conversion
 ; CHECK-NEXT: AArch64 Store Pair Suppression
 ; CHECK-NEXT: AArch64 SIMD instructions optimization pass
+; CHECK-NEXT: AArch64 Stack Tagging PreRA
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Early Machine Loop Invariant Code Motion


@@ -0,0 +1,141 @@
; RUN: llc < %s -mtriple=aarch64 -mattr=+mte | FileCheck %s --check-prefixes=DEFAULT,COMMON
; RUN: llc < %s -mtriple=aarch64 -mattr=+mte -stack-tagging-unchecked-ld-st=never | FileCheck %s --check-prefixes=NEVER,COMMON
; RUN: llc < %s -mtriple=aarch64 -mattr=+mte -stack-tagging-unchecked-ld-st=always | FileCheck %s --check-prefixes=ALWAYS,COMMON

declare void @use8(i8*)
declare void @use32(i32*)
declare void @use2x64([2 x i64]*)
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)

define i32 @CallLd() sanitize_memtag {
entry:
  %x = alloca i32, align 4
  call void @use32(i32* %x)
  %a = load i32, i32* %x
  ret i32 %a
}

; COMMON: CallLd:
; COMMON: bl use32
; ALWAYS: ldr w0, [sp]
; DEFAULT: ldr w0, [sp]
; NEVER: ldr w0, [x{{.*}}]
; COMMON: ret

define void @CallStCall() sanitize_memtag {
entry:
  %x = alloca i32, align 4
  call void @use32(i32* %x)
  store i32 42, i32* %x
  call void @use32(i32* %x)
  ret void
}

; COMMON: CallStCall:
; COMMON: bl use32
; ALWAYS: str w{{.*}}, [sp]
; DEFAULT: str w{{.*}}, [sp]
; NEVER: str w{{.*}}, [x{{.*}}]
; COMMON: bl use32
; COMMON: ret

define void @CallStPair(i64 %z) sanitize_memtag {
entry:
  %x = alloca [2 x i64], align 8
  call void @use2x64([2 x i64]* %x)
  %x0 = getelementptr inbounds [2 x i64], [2 x i64]* %x, i64 0, i64 0
  store i64 %z, i64* %x0, align 8
  %x1 = getelementptr inbounds [2 x i64], [2 x i64]* %x, i64 0, i64 1
  store i64 %z, i64* %x1, align 8
  call void @use2x64([2 x i64]* %x)
  ret void
}

; COMMON: CallStPair:
; COMMON: bl use2x64
; ALWAYS: stp {{.*}}, [sp]
; DEFAULT: stp {{.*}}, [sp]
; NEVER: stp {{.*}}, [x{{.*}}]
; COMMON: bl use2x64
; COMMON: ret

; One of the two allocas will end up out of range of ldrb [sp].
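; (Each alloca is 4096 bytes, so one of the two slots lands at an SP offset of
; 4096 or more, beyond the 4095-byte unsigned-immediate range of ldrb; with
; -stack-tagging-unchecked-ld-st=always that access has to go through the LDG
; fallback instead, as the ALWAYS lines below check.)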
define dso_local i8 @LargeFrame() sanitize_memtag {
entry:
  %x = alloca [4096 x i8], align 4
  %y = alloca [4096 x i8], align 4
  %0 = getelementptr inbounds [4096 x i8], [4096 x i8]* %x, i64 0, i64 0
  %1 = getelementptr inbounds [4096 x i8], [4096 x i8]* %y, i64 0, i64 0
  call void @use8(i8* %0)
  call void @use8(i8* %1)
  %2 = load i8, i8* %0, align 4
  %3 = load i8, i8* %1, align 4
  %add = add i8 %3, %2
  ret i8 %add
}

; COMMON: LargeFrame:
; COMMON: bl use8
; COMMON: bl use8

; NEVER: ldrb [[A:w.*]], [x{{.*}}]
; NEVER: ldrb [[B:w.*]], [x{{.*}}]

; DEFAULT: ldrb [[A:w.*]], [x{{.*}}]
; DEFAULT: ldrb [[B:w.*]], [x{{.*}}]

; ALWAYS: ldg [[PA:x.*]], [x{{.*}}]
; ALWAYS: ldrb [[B:w.*]], [sp]
; ALWAYS: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}}

; COMMON: add w0, [[B]], [[A]]
; COMMON: ret

; One of these allocas is closer to FP than to SP, and within 256 bytes
; of the former (see hardcoded limit in resolveFrameOffsetReference).
; It could be lowered to an FP-relative load, but not when doing an
; unchecked access to tagged memory!
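; (Judging by the offsets in the checks below, the three slots end up 208 bytes
; apart, roughly 624 bytes of locals in total, well under the 0xf00 safe-mode
; limit, so all three loads are expected to be demoted to SP-relative form.)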
define i8 @FPOffset() "frame-pointer"="all" sanitize_memtag {
  %x = alloca [200 x i8], align 4
  %y = alloca [200 x i8], align 4
  %z = alloca [200 x i8], align 4
  %x0 = getelementptr inbounds [200 x i8], [200 x i8]* %x, i64 0, i64 0
  %y0 = getelementptr inbounds [200 x i8], [200 x i8]* %y, i64 0, i64 0
  %z0 = getelementptr inbounds [200 x i8], [200 x i8]* %z, i64 0, i64 0
  call void @use8(i8* %x0)
  call void @use8(i8* %y0)
  call void @use8(i8* %z0)
  %x1 = load i8, i8* %x0, align 4
  %y1 = load i8, i8* %y0, align 4
  %z1 = load i8, i8* %z0, align 4
  %a = add i8 %x1, %y1
  %b = add i8 %a, %z1
  ret i8 %b
}

; COMMON: FPOffset:
; COMMON: bl use8
; COMMON: bl use8
; COMMON: bl use8

; All three loads are SP-based.
; ALWAYS-DAG: ldrb w{{.*}}, [sp, #416]
; ALWAYS-DAG: ldrb w{{.*}}, [sp, #208]
; ALWAYS-DAG: ldrb w{{.*}}, [sp]

; DEFAULT-DAG: ldrb w{{.*}}, [sp, #416]
; DEFAULT-DAG: ldrb w{{.*}}, [sp, #208]
; DEFAULT-DAG: ldrb w{{.*}}, [sp]

; NEVER-DAG: ldrb w{{.*}}, [x{{.*}}]
; NEVER-DAG: ldrb w{{.*}}, [x{{.*}}]
; NEVER-DAG: ldrb w{{.*}}, [x{{.*}}]

; COMMON: ret