[WebAssembly] Enhanced register stackification
This patch revamps the RegStackifier pass with a new tree traversal mechanism, enabling three major new features:

 - Stackification of values with multiple uses, using the result value of set_local
 - More aggressive stackification of instructions with side effects
 - Reordering operands in commutative instructions to enable more stackification

llvm-svn: 259009
commit 2e8acd83fd (parent 68851f60b4)
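To make the idea concrete before diving into the diff, here is a small standalone sketch (an illustration only, not code from this patch): when every def in an expression tree has a single use, a depth-first, post-order walk emits each operand immediately before its consumer, which is exactly the order a stack machine pops them. The pass below builds such trees bottom-up inside basic blocks, and the new tee_local handling extends the idea to multi-use values.

// Toy model (not LLVM code) of the core idea, under simplified assumptions:
// each node is a single-use def, so a post-order walk emits every operand
// right before its consumer, which is exactly expression-stack order.
#include <cstdio>
#include <vector>

struct Node {
  const char *Op;
  std::vector<const Node *> Operands;
};

static void EmitStackified(const Node *N) {
  for (const Node *Operand : N->Operands)
    EmitStackified(Operand); // push operands first, left to right
  std::printf("%s\n", N->Op); // then the instruction that pops them
}

int main() {
  Node A{"i32.const 1", {}}, B{"i32.const 2", {}};
  Node Add{"i32.add", {&A, &B}};
  EmitStackified(&Add); // prints the two consts, then the add
}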
include/llvm/CodeGen/MachineInstr.h
@@ -343,6 +343,14 @@ public:
     return make_range(operands_begin() + getDesc().getNumDefs(),
                       operands_end());
   }
+  iterator_range<mop_iterator> explicit_uses() {
+    return make_range(operands_begin() + getDesc().getNumDefs(),
+                      operands_begin() + getNumExplicitOperands());
+  }
+  iterator_range<const_mop_iterator> explicit_uses() const {
+    return make_range(operands_begin() + getDesc().getNumDefs(),
+                      operands_begin() + getNumExplicitOperands());
+  }
 
   /// Returns the number of the operand iterator \p I points to.
   unsigned getOperandNo(const_mop_iterator I) const {

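As a usage note (a hypothetical caller, not part of the patch): the new range spans the explicit operands between the defs and any implicit operands, so clients can iterate an instruction's uses without index arithmetic. For example, counting virtual-register uses might look like this, assuming LLVM's circa-2016 headers:

// Sketch: count the virtual-register uses of an instruction via the new
// explicit_uses() range. MI is provided by the caller.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetRegisterInfo.h"

static unsigned CountVRegUses(const llvm::MachineInstr &MI) {
  unsigned Count = 0;
  for (const llvm::MachineOperand &MO : MI.explicit_uses())
    if (MO.isReg() && llvm::TargetRegisterInfo::isVirtualRegister(MO.getReg()))
      ++Count;
  return Count;
}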
lib/Target/WebAssembly/README.txt
@@ -32,12 +32,6 @@ Interesting work that remains to be done:
 
 //===---------------------------------------------------------------------===//
 
-set_local instructions have a return value. We should (a) model this,
-and (b) write optimizations which take advantage of it. Keep in mind that
-many set_local instructions are implicit!
-
-//===---------------------------------------------------------------------===//
-
 Br, br_if, and tableswitch instructions can support having a value on the
 expression stack across the jump (sometimes). We should (a) model this, and
 (b) extend the stackifier to utilize it.

@@ -87,3 +81,17 @@ Find a clean way to fix the problem which leads to the Shrink Wrapping pass
 being run after the WebAssembly PEI pass.
 
+//===---------------------------------------------------------------------===//
+
+When setting multiple local variables to the same constant, we currently get
+code like this:
+
+  i32.const $4=, 0
+  i32.const $3=, 0
+
+It could be done with a smaller encoding like this:
+
+  i32.const  $push5=, 0
+  tee_local  $push6=, $4=, $pop5
+  copy_local $3=, $pop6
+
 //===---------------------------------------------------------------------===//

lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -15,6 +15,7 @@
 
 #include "WebAssemblyInstrInfo.h"
 #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
 #include "WebAssemblySubtarget.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"

@@ -75,6 +76,21 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       .addReg(SrcReg, KillSrc ? RegState::Kill : 0);
 }
 
+MachineInstr *
+WebAssemblyInstrInfo::commuteInstructionImpl(MachineInstr *MI, bool NewMI,
+                                             unsigned OpIdx1,
+                                             unsigned OpIdx2) const {
+  // If the operands are stackified, we can't reorder them.
+  WebAssemblyFunctionInfo &MFI =
+      *MI->getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>();
+  if (MFI.isVRegStackified(MI->getOperand(OpIdx1).getReg()) ||
+      MFI.isVRegStackified(MI->getOperand(OpIdx2).getReg()))
+    return nullptr;
+
+  // Otherwise use the default implementation.
+  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
 // Branch analysis.
 bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                          MachineBasicBlock *&TBB,

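For reference, this is roughly how a client drives this hook through the public TargetInstrInfo entry points (a sketch under the assumption of LLVM's circa-2016 API; the CommutingState helper later in this patch does the real version). The override above makes the attempt fail cleanly when an operand is already stackified:

#include "llvm/Target/TargetInstrInfo.h"

// Returns true if MI's operands were swapped in place; false if the target
// declined (for WebAssembly: when an operand is stackified, per the
// override above).
static bool TryCommute(const llvm::TargetInstrInfo *TII,
                       llvm::MachineInstr *MI) {
  unsigned OpIdx1 = llvm::TargetInstrInfo::CommuteAnyOperandIndex;
  unsigned OpIdx2 = llvm::TargetInstrInfo::CommuteAnyOperandIndex;
  return TII->findCommutedOpIndices(MI, OpIdx1, OpIdx2) &&
         TII->commuteInstruction(MI, /*NewMI=*/false, OpIdx1, OpIdx2);
}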
lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -40,6 +40,9 @@ public:
   void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                    DebugLoc DL, unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const override;
+  MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI,
+                                       unsigned OpIdx1,
+                                       unsigned OpIdx2) const override;
 
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,

lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -107,15 +107,8 @@ defm : ARGUMENT<F64>;
 let Defs = [ARGUMENTS] in {
 
 // get_local and set_local are not generated by instruction selection; they
-// are implied by virtual register uses and defs in most contexts. However,
-// they are explicitly emitted for special purposes.
+// are implied by virtual register uses and defs.
 multiclass LOCAL<WebAssemblyRegClass vt> {
   def GET_LOCAL_#vt : I<(outs vt:$res), (ins i32imm:$regno), [],
                         "get_local\t$res, $regno">;
-  // TODO: set_local returns its operand value
   def SET_LOCAL_#vt : I<(outs), (ins i32imm:$regno, vt:$src), [],
                         "set_local\t$regno, $src">;
 
   // COPY_LOCAL is not an actual instruction in wasm, but since we allow
   // get_local and set_local to be implicit, we can have a COPY_LOCAL which
   // is actually a no-op because all the work is done in the implied

@@ -123,6 +116,13 @@ multiclass LOCAL<WebAssemblyRegClass vt> {
   let isAsCheapAsAMove = 1 in
   def COPY_LOCAL_#vt : I<(outs vt:$res), (ins vt:$src), [],
                          "copy_local\t$res, $src">;
+
+  // TEE_LOCAL is similar to COPY_LOCAL, but writes two copies of its result.
+  // Typically this would be used to stackify one result and write the other
+  // result to a local.
+  let isAsCheapAsAMove = 1 in
+  def TEE_LOCAL_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), [],
+                        "tee_local\t$res, $also, $src">;
 }
 defm : LOCAL<I32>;
 defm : LOCAL<I64>;

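The name follows Unix tee: the incoming value is both stored and passed through. As a plain C++ analogy (an illustration only, not patch code):

// C++ analogy for tee_local's semantics: write the incoming value to a
// local and also pass it through as the result, so one copy can stay on
// the expression stack while the other lands in a local.
static int TeeLocal(int &Local, int Value) {
  Local = Value; // the "set_local" half
  return Value;  // the value stays available to the consumer
}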
lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -27,6 +27,8 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/Debug.h"

@@ -44,12 +46,13 @@ class WebAssemblyRegStackify final : public MachineFunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<MachineDominatorTree>();
     AU.addRequired<LiveIntervals>();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
     AU.addPreserved<SlotIndexes>();
     AU.addPreserved<LiveIntervals>();
-    AU.addPreservedID(MachineDominatorsID);
     AU.addPreservedID(LiveVariablesID);
+    AU.addPreserved<MachineDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 

@@ -89,8 +92,8 @@ static void ImposeStackOrdering(MachineInstr *MI) {
 // TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
 // more precise.
 static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
-                         AliasAnalysis &AA, LiveIntervals &LIS,
-                         MachineRegisterInfo &MRI) {
+                         AliasAnalysis &AA, const LiveIntervals &LIS,
+                         const MachineRegisterInfo &MRI) {
   assert(Def->getParent() == Insert->getParent());
   bool SawStore = false, SawSideEffects = false;
   MachineBasicBlock::const_iterator D(Def), I(Insert);

@@ -133,6 +136,250 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
          !(SawSideEffects && !Def->isSafeToMove(&AA, SawStore));
 }
 
+/// Test whether OneUse, a use of Reg, dominates all of Reg's other uses.
+static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
+                                     const MachineBasicBlock &MBB,
+                                     const MachineRegisterInfo &MRI,
+                                     const MachineDominatorTree &MDT) {
+  for (const MachineOperand &Use : MRI.use_operands(Reg)) {
+    if (&Use == &OneUse)
+      continue;
+    const MachineInstr *UseInst = Use.getParent();
+    const MachineInstr *OneUseInst = OneUse.getParent();
+    if (UseInst->getOpcode() == TargetOpcode::PHI) {
+      // Test that the PHI use, which happens on the CFG edge rather than
+      // within the PHI's own block, is dominated by the one selected use.
+      const MachineBasicBlock *Pred =
+          UseInst->getOperand(&Use - &UseInst->getOperand(0) + 1).getMBB();
+      if (!MDT.dominates(&MBB, Pred))
+        return false;
+    } else if (UseInst == OneUseInst) {
+      // Another use in the same instruction. We need to ensure that the one
+      // selected use happens "before" it.
+      if (&OneUse > &Use)
+        return false;
+    } else {
+      // Test that the use is dominated by the one selected use.
+      if (!MDT.dominates(OneUseInst, UseInst))
+        return false;
+    }
+  }
+  return true;
+}
+
+/// Get the appropriate tee_local opcode for the given register class.
+static unsigned GetTeeLocalOpcode(const TargetRegisterClass *RC) {
+  if (RC == &WebAssembly::I32RegClass)
+    return WebAssembly::TEE_LOCAL_I32;
+  if (RC == &WebAssembly::I64RegClass)
+    return WebAssembly::TEE_LOCAL_I64;
+  if (RC == &WebAssembly::F32RegClass)
+    return WebAssembly::TEE_LOCAL_F32;
+  if (RC == &WebAssembly::F64RegClass)
+    return WebAssembly::TEE_LOCAL_F64;
+  llvm_unreachable("Unexpected register class");
+}
+
+/// A single-use def in the same block with no intervening memory or register
+/// dependencies; move the def down and nest it with the current instruction.
+static MachineInstr *MoveForSingleUse(unsigned Reg, MachineInstr *Def,
+                                      MachineBasicBlock &MBB,
+                                      MachineInstr *Insert, LiveIntervals &LIS,
+                                      WebAssemblyFunctionInfo &MFI) {
+  MBB.splice(Insert, &MBB, Def);
+  LIS.handleMove(Def);
+  MFI.stackifyVReg(Reg);
+  ImposeStackOrdering(Def);
+  return Def;
+}
+
+/// A trivially cloneable instruction; clone it and nest the new copy with the
+/// current instruction.
+static MachineInstr *
+RematerializeCheapDef(unsigned Reg, MachineOperand &Op, MachineInstr *Def,
+                      MachineBasicBlock &MBB, MachineInstr *Insert,
+                      LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
+                      MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII,
+                      const WebAssemblyRegisterInfo *TRI) {
+  unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+  TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
+  Op.setReg(NewReg);
+  MachineInstr *Clone = &*std::prev(MachineBasicBlock::instr_iterator(Insert));
+  LIS.InsertMachineInstrInMaps(Clone);
+  LIS.createAndComputeVirtRegInterval(NewReg);
+  MFI.stackifyVReg(NewReg);
+  ImposeStackOrdering(Clone);
+
+  // If that was the last use of the original, delete the original.
+  // Otherwise shrink the LiveInterval.
+  if (MRI.use_empty(Reg)) {
+    SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
+    LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
+    LIS.removeVRegDefAt(LIS.getInterval(Reg), Idx);
+    LIS.removeInterval(Reg);
+    LIS.RemoveMachineInstrFromMaps(Def);
+    Def->eraseFromParent();
+  } else {
+    LIS.shrinkToUses(&LIS.getInterval(Reg));
+  }
+  return Clone;
+}
+
+/// A multiple-use def in the same block with no intervening memory or register
+/// dependencies; move the def down, nest it with the current instruction, and
+/// insert a tee_local to satisfy the rest of the uses. As an illustration,
+/// rewrite this:
+///
+///    Reg = INST ...        // Def
+///    INST ..., Reg, ...    // Insert
+///    INST ..., Reg, ...
+///    INST ..., Reg, ...
+///
+/// to this:
+///
+///    Reg = INST ...        // Def (to become the new Insert)
+///    TeeReg, NewReg = TEE_LOCAL_... Reg
+///    INST ..., TeeReg, ... // Insert
+///    INST ..., NewReg, ...
+///    INST ..., NewReg, ...
+///
+/// with Reg and TeeReg stackified. This eliminates a get_local from the
+/// resulting code.
+static MachineInstr *MoveAndTeeForMultiUse(
+    unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB,
+    MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
+    MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) {
+  MBB.splice(Insert, &MBB, Def);
+  LIS.handleMove(Def);
+  const auto *RegClass = MRI.getRegClass(Reg);
+  unsigned NewReg = MRI.createVirtualRegister(RegClass);
+  unsigned TeeReg = MRI.createVirtualRegister(RegClass);
+  MRI.replaceRegWith(Reg, NewReg);
+  MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(),
+                              TII->get(GetTeeLocalOpcode(RegClass)), TeeReg)
+                          .addReg(NewReg, RegState::Define)
+                          .addReg(Reg);
+  Op.setReg(TeeReg);
+  Def->getOperand(0).setReg(Reg);
+  LIS.InsertMachineInstrInMaps(Tee);
+  LIS.shrinkToUses(&LIS.getInterval(Reg));
+  LIS.createAndComputeVirtRegInterval(NewReg);
+  LIS.createAndComputeVirtRegInterval(TeeReg);
+  MFI.stackifyVReg(Reg);
+  MFI.stackifyVReg(TeeReg);
+  ImposeStackOrdering(Def);
+  ImposeStackOrdering(Tee);
+  return Def;
+}
+
+namespace {
+/// A stack for walking the tree of instructions being built, visiting the
+/// MachineOperands in DFS order.
+class TreeWalkerState {
+  typedef MachineInstr::mop_iterator mop_iterator;
+  typedef std::reverse_iterator<mop_iterator> mop_reverse_iterator;
+  typedef iterator_range<mop_reverse_iterator> RangeTy;
+  SmallVector<RangeTy, 4> Worklist;
+
+public:
+  explicit TreeWalkerState(MachineInstr *Insert) {
+    const iterator_range<mop_iterator> &Range = Insert->explicit_uses();
+    if (Range.begin() != Range.end())
+      Worklist.push_back(reverse(Range));
+  }
+
+  bool Done() const { return Worklist.empty(); }
+
+  MachineOperand &Pop() {
+    RangeTy &Range = Worklist.back();
+    MachineOperand &Op = *Range.begin();
+    Range = drop_begin(Range, 1);
+    if (Range.begin() == Range.end())
+      Worklist.pop_back();
+    assert((Worklist.empty() ||
+            Worklist.back().begin() != Worklist.back().end()) &&
+           "Empty ranges shouldn't remain in the worklist");
+    return Op;
+  }
+
+  /// Push Instr's operands onto the stack to be visited.
+  void PushOperands(MachineInstr *Instr) {
+    const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
+    if (Range.begin() != Range.end())
+      Worklist.push_back(reverse(Range));
+  }
+
+  /// Some of Instr's operands are on the top of the stack; remove them and
+  /// re-insert them starting from the beginning (because we've commuted them).
+  void ResetTopOperands(MachineInstr *Instr) {
+    assert(HasRemainingOperands(Instr) &&
+           "Resetting operands should only be done when the instruction has "
+           "an operand still on the stack");
+    Worklist.back() = reverse(Instr->explicit_uses());
+  }
+
+  /// Test whether Instr has operands remaining to be visited at the top of
+  /// the stack.
+  bool HasRemainingOperands(const MachineInstr *Instr) const {
+    if (Worklist.empty())
+      return false;
+    const RangeTy &Range = Worklist.back();
+    return Range.begin() != Range.end() && Range.begin()->getParent() == Instr;
+  }
+};
+
+/// State to keep track of whether commuting is in flight or whether it's been
+/// tried for the current instruction and didn't work.
+class CommutingState {
+  /// There are effectively three states: the initial state where we haven't
+  /// started commuting anything and we don't know anything yet, the tentative
+  /// state where we've commuted the operands of the current instruction and
+  /// are revisiting it, and the declined state where we've reverted the
+  /// operands back to their original order and will no longer commute it
+  /// further.
+  bool TentativelyCommuting;
+  bool Declined;
+
+  /// During the tentative state, these hold the operand indices of the
+  /// commuted operands.
+  unsigned Operand0, Operand1;
+
+public:
+  CommutingState() : TentativelyCommuting(false), Declined(false) {}
+
+  /// Stackification for an operand was not successful due to ordering
+  /// constraints. If possible, and if we haven't already tried it and declined
+  /// it, commute Insert's operands and prepare to revisit it.
+  void MaybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
+                    const WebAssemblyInstrInfo *TII) {
+    if (TentativelyCommuting) {
+      assert(!Declined &&
+             "Don't decline commuting until you've finished trying it");
+      // Commuting didn't help. Revert it.
+      TII->commuteInstruction(Insert, /*NewMI=*/false, Operand0, Operand1);
+      TentativelyCommuting = false;
+      Declined = true;
+    } else if (!Declined && TreeWalker.HasRemainingOperands(Insert)) {
+      Operand0 = TargetInstrInfo::CommuteAnyOperandIndex;
+      Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
+      if (TII->findCommutedOpIndices(Insert, Operand0, Operand1)) {
+        // Tentatively commute the operands and try again.
+        TII->commuteInstruction(Insert, /*NewMI=*/false, Operand0, Operand1);
+        TreeWalker.ResetTopOperands(Insert);
+        TentativelyCommuting = true;
+        Declined = false;
+      }
+    }
+  }
+
+  /// Stackification for some operand was successful. Reset to the default
+  /// state.
+  void Reset() {
+    TentativelyCommuting = false;
+    Declined = false;
+  }
+};
+} // end anonymous namespace
+
 bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
   DEBUG(dbgs() << "********** Register Stackifying **********\n"
                   "********** Function: "

@@ -144,6 +391,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
   const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
   LiveIntervals &LIS = getAnalysis<LiveIntervals>();
 
   // Walk the instructions from the bottom up. Currently we don't look past

@@ -165,20 +413,36 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
 
       // Iterate through the inputs in reverse order, since we'll be pulling
      // operands off the stack in LIFO order.
-      bool AnyStackified = false;
-      for (MachineOperand &Op : reverse(Insert->uses())) {
+      CommutingState Commuting;
+      TreeWalkerState TreeWalker(Insert);
+      while (!TreeWalker.Done()) {
+        MachineOperand &Op = TreeWalker.Pop();
+
         // We're only interested in explicit virtual register operands.
-        if (!Op.isReg() || Op.isImplicit() || !Op.isUse())
+        if (!Op.isReg())
           continue;
 
         unsigned Reg = Op.getReg();
 
-        // Only consider registers with a single definition.
-        // TODO: Eventually we may relax this, to stackify phi transfers.
-        MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
-        if (!Def)
+        assert(Op.isUse() && "explicit_uses() should only iterate over uses");
+        assert(!Op.isImplicit() &&
+               "explicit_uses() should only iterate over explicit operands");
+        if (TargetRegisterInfo::isPhysicalRegister(Reg))
          continue;
 
+        // Identify the definition for this register at this point. Most
+        // registers are in SSA form here so we try a quick MRI query first.
+        MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+        if (!Def) {
+          // MRI doesn't know what the Def is. Try asking LIS.
+          const VNInfo *ValNo = LIS.getInterval(Reg).getVNInfoBefore(
+              LIS.getInstructionIndex(Insert));
+          if (!ValNo)
+            continue;
+          Def = LIS.getInstructionFromIndex(ValNo->def);
+          if (!Def)
+            continue;
+        }
+
         // Don't nest an INLINE_ASM def into anything, because we don't have
         // constraints for $pop outputs.
         if (Def->getOpcode() == TargetOpcode::INLINEASM)

@@ -196,59 +460,52 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
             Def->getOpcode() == WebAssembly::ARGUMENT_F64)
           continue;
 
-        if (MRI.hasOneUse(Reg) && Def->getParent() == &MBB &&
-            IsSafeToMove(Def, Insert, AA, LIS, MRI)) {
-          // A single-use def in the same block with no intervening memory or
-          // register dependencies; move the def down and nest it with the
-          // current instruction.
-          // TODO: Stackify multiple-use values, taking advantage of set_local
-          // returning its result.
-          Changed = true;
-          AnyStackified = true;
-          MBB.splice(Insert, &MBB, Def);
-          LIS.handleMove(Def);
-          MFI.stackifyVReg(Reg);
-          ImposeStackOrdering(Def);
-          Insert = Def;
+        // Decide which strategy to take. Prefer to move a single-use value
+        // over cloning it, and prefer cloning over introducing a tee_local.
+        // For moving, we require the def to be in the same block as the use;
+        // this makes things simpler (LiveIntervals' handleMove function only
+        // supports intra-block moves) and it's MachineSink's job to catch all
+        // the sinking opportunities anyway.
+        bool SameBlock = Def->getParent() == &MBB;
+        bool CanMove = SameBlock && IsSafeToMove(Def, Insert, AA, LIS, MRI);
+        if (CanMove && MRI.hasOneUse(Reg)) {
+          Insert = MoveForSingleUse(Reg, Def, MBB, Insert, LIS, MFI);
         } else if (Def->isAsCheapAsAMove() &&
                    TII->isTriviallyReMaterializable(Def, &AA)) {
-          // A trivially cloneable instruction; clone it and nest the new copy
-          // with the current instruction.
-          Changed = true;
-          AnyStackified = true;
-          unsigned OldReg = Def->getOperand(0).getReg();
-          unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
-          TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
-          Op.setReg(NewReg);
-          MachineInstr *Clone =
-              &*std::prev(MachineBasicBlock::instr_iterator(Insert));
-          LIS.InsertMachineInstrInMaps(Clone);
-          LIS.createAndComputeVirtRegInterval(NewReg);
-          MFI.stackifyVReg(NewReg);
-          ImposeStackOrdering(Clone);
-          Insert = Clone;
-
-          // If that was the last use of the original, delete the original.
-          // Otherwise shrink the LiveInterval.
-          if (MRI.use_empty(OldReg)) {
-            SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
-            LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
-            LIS.removeVRegDefAt(LIS.getInterval(OldReg), Idx);
-            LIS.removeInterval(OldReg);
-            LIS.RemoveMachineInstrFromMaps(Def);
-            Def->eraseFromParent();
-          } else {
-            LIS.shrinkToUses(&LIS.getInterval(OldReg));
-          }
+          Insert = RematerializeCheapDef(Reg, Op, Def, MBB, Insert, LIS, MFI,
+                                         MRI, TII, TRI);
+        } else if (CanMove &&
+                   OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT)) {
+          Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
+                                         MRI, TII);
+        } else {
+          // We failed to stackify the operand. If the problem was ordering
+          // constraints, Commuting may be able to help.
+          if (!CanMove && SameBlock)
+            Commuting.MaybeCommute(Insert, TreeWalker, TII);
+          // Proceed to the next operand.
+          continue;
         }
+
+        // We stackified an operand. Add the defining instruction's operands to
+        // the worklist stack now to continue to build an ever deeper tree.
+        Commuting.Reset();
+        TreeWalker.PushOperands(Insert);
       }
-      if (AnyStackified)
+
+      // If we stackified any operands, skip over the tree to start looking for
+      // the next instruction we can build a tree on.
+      if (Insert != &*MII) {
         ImposeStackOrdering(&*MII);
+        MII = std::prev(
+            make_reverse_iterator(MachineBasicBlock::iterator(Insert)));
+        Changed = true;
+      }
     }
   }
 
-  // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere
-  // so that it never looks like a use-before-def.
+  // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere so
+  // that it never looks like a use-before-def.
   if (Changed) {
     MF.getRegInfo().addLiveIn(WebAssembly::EXPR_STACK);
     for (MachineBasicBlock &MBB : MF)

|
||||
for (MachineOperand &MO : reverse(MI.explicit_operands())) {
|
||||
if (!MO.isReg())
|
||||
continue;
|
||||
unsigned VReg = MO.getReg();
|
||||
unsigned Reg = MO.getReg();
|
||||
|
||||
// Don't stackify physregs like SP or FP.
|
||||
if (!TargetRegisterInfo::isVirtualRegister(VReg))
|
||||
if (!TargetRegisterInfo::isVirtualRegister(Reg))
|
||||
continue;
|
||||
|
||||
if (MFI.isVRegStackified(VReg)) {
|
||||
if (MFI.isVRegStackified(Reg)) {
|
||||
if (MO.isDef())
|
||||
Stack.push_back(VReg);
|
||||
Stack.push_back(Reg);
|
||||
else
|
||||
assert(Stack.pop_back_val() == VReg);
|
||||
assert(Stack.pop_back_val() == Reg &&
|
||||
"Register stack pop should be paired with a push");
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: Generalize this code to support keeping values on the stack across
|
||||
// basic block boundaries.
|
||||
assert(Stack.empty());
|
||||
assert(Stack.empty() &&
|
||||
"Register stack pushes and pops should be balanced");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
test/CodeGen/WebAssembly/cfg-stackify.ll
@@ -979,7 +979,7 @@ bb6:
 ; OPT-LABEL: test11:
 ; OPT: block{{$}}
 ; OPT-NEXT: block{{$}}
-; OPT: br_if $0, 0{{$}}
+; OPT: br_if $pop{{[0-9]+}}, 0{{$}}
 ; OPT-NEXT: block{{$}}
 ; OPT-NOT: block
 ; OPT: br_if $0, 0{{$}}

@@ -1121,31 +1121,33 @@ bb7:
 ; CHECK-LABEL: test13:
 ; CHECK-NEXT: .local i32{{$}}
 ; CHECK: block{{$}}
-; CHECK: br_if $pop4, 0{{$}}
+; CHECK: br_if $pop5, 0{{$}}
 ; CHECK-NEXT: return{{$}}
 ; CHECK-NEXT: .LBB22_2:
 ; CHECK-NEXT: end_block{{$}}
 ; CHECK: block{{$}}
 ; CHECK-NEXT: br_if $0, 0{{$}}
 ; CHECK-NEXT: i32.const $push3=, 0{{$}}
 ; CHECK-NEXT: br_if $pop3, 0{{$}}
 ; CHECK: .LBB22_4:
 ; CHECK-NEXT: end_block{{$}}
 ; CHECK: block{{$}}
-; CHECK: br_if $pop6, 0{{$}}
+; CHECK: br_if $pop7, 0{{$}}
 ; CHECK-NEXT: end_block{{$}}
 ; CHECK-NEXT: unreachable{{$}}
 ; OPT-LABEL: test13:
 ; OPT-NEXT: .local i32{{$}}
 ; OPT: block{{$}}
-; OPT: br_if $pop4, 0{{$}}
+; OPT: br_if $pop5, 0{{$}}
 ; OPT-NEXT: return{{$}}
 ; OPT-NEXT: .LBB22_2:
 ; OPT-NEXT: end_block{{$}}
 ; OPT: block{{$}}
 ; OPT-NEXT: br_if $0, 0{{$}}
 ; OPT-NEXT: i32.const $push3=, 0{{$}}
 ; OPT-NEXT: br_if $pop3, 0{{$}}
 ; OPT: .LBB22_4:
 ; OPT-NEXT: end_block{{$}}
 ; OPT: block{{$}}
-; OPT: br_if $pop6, 0{{$}}
+; OPT: br_if $pop7, 0{{$}}
 ; OPT-NEXT: end_block{{$}}
 ; OPT-NEXT: unreachable{{$}}
 define void @test13() noinline optnone {

test/CodeGen/WebAssembly/reg-stackify.ll
@@ -90,17 +90,18 @@ false:
 }
 
 ; Test an interesting case where the load has multiple uses and cannot
-; be trivially stackified.
+; be trivially stackified. However, it can be stackified with a tee_local.
 
 ; CHECK-LABEL: multiple_uses:
 ; CHECK-NEXT: .param i32, i32, i32{{$}}
 ; CHECK-NEXT: .local i32{{$}}
-; CHECK-NEXT: i32.load $3=, 0($2){{$}}
 ; CHECK-NEXT: block{{$}}
-; CHECK-NEXT: i32.ge_u $push0=, $3, $1{{$}}
-; CHECK-NEXT: br_if $pop0, 0{{$}}
-; CHECK-NEXT: i32.lt_u $push1=, $3, $0{{$}}
+; CHECK-NEXT: i32.load $push0=, 0($2){{$}}
+; CHECK-NEXT: tee_local $push3=, $3=, $pop0{{$}}
+; CHECK-NEXT: i32.ge_u $push1=, $pop3, $1{{$}}
 ; CHECK-NEXT: br_if $pop1, 0{{$}}
+; CHECK-NEXT: i32.lt_u $push2=, $3, $0{{$}}
+; CHECK-NEXT: br_if $pop2, 0{{$}}
 ; CHECK-NEXT: i32.store $discard=, 0($2), $3{{$}}
 ; CHECK-NEXT: .LBB5_3:
 ; CHECK-NEXT: end_block{{$}}

@@ -145,4 +146,102 @@ entry:
   ret void
 }
 
+; Div instructions have side effects and can't be reordered, but this entire
+; function should still be able to be stackified because it's already in
+; tree order.
+
+; CHECK-LABEL: div_tree:
+; CHECK-NEXT: .param i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.div_s $push0=, $0, $1
+; CHECK-NEXT: i32.div_s $push1=, $2, $3
+; CHECK-NEXT: i32.div_s $push2=, $pop0, $pop1
+; CHECK-NEXT: i32.div_s $push3=, $4, $5
+; CHECK-NEXT: i32.div_s $push4=, $6, $7
+; CHECK-NEXT: i32.div_s $push5=, $pop3, $pop4
+; CHECK-NEXT: i32.div_s $push6=, $pop2, $pop5
+; CHECK-NEXT: i32.div_s $push7=, $8, $9
+; CHECK-NEXT: i32.div_s $push8=, $10, $11
+; CHECK-NEXT: i32.div_s $push9=, $pop7, $pop8
+; CHECK-NEXT: i32.div_s $push10=, $12, $13
+; CHECK-NEXT: i32.div_s $push11=, $14, $15
+; CHECK-NEXT: i32.div_s $push12=, $pop10, $pop11
+; CHECK-NEXT: i32.div_s $push13=, $pop9, $pop12
+; CHECK-NEXT: i32.div_s $push14=, $pop6, $pop13
+; CHECK-NEXT: return $pop14
+define i32 @div_tree(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) {
+entry:
+  %div = sdiv i32 %a, %b
+  %div1 = sdiv i32 %c, %d
+  %div2 = sdiv i32 %div, %div1
+  %div3 = sdiv i32 %e, %f
+  %div4 = sdiv i32 %g, %h
+  %div5 = sdiv i32 %div3, %div4
+  %div6 = sdiv i32 %div2, %div5
+  %div7 = sdiv i32 %i, %j
+  %div8 = sdiv i32 %k, %l
+  %div9 = sdiv i32 %div7, %div8
+  %div10 = sdiv i32 %m, %n
+  %div11 = sdiv i32 %o, %p
+  %div12 = sdiv i32 %div10, %div11
+  %div13 = sdiv i32 %div9, %div12
+  %div14 = sdiv i32 %div6, %div13
+  ret i32 %div14
+}
+
+; A simple multiple-use case.
+
+; CHECK-LABEL: simple_multiple_use:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.mul $push0=, $1, $0{{$}}
+; CHECK-NEXT: tee_local $push1=, $0=, $pop0{{$}}
+; CHECK-NEXT: call use_a@FUNCTION, $pop1{{$}}
+; CHECK-NEXT: call use_b@FUNCTION, $0{{$}}
+; CHECK-NEXT: return{{$}}
+declare void @use_a(i32)
+declare void @use_b(i32)
+define void @simple_multiple_use(i32 %x, i32 %y) {
+  %mul = mul i32 %y, %x
+  call void @use_a(i32 %mul)
+  call void @use_b(i32 %mul)
+  ret void
+}
+
+; Multiple uses of the same value in one instruction.
+
+; CHECK-LABEL: multiple_uses_in_same_insn:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.mul $push0=, $1, $0{{$}}
+; CHECK-NEXT: tee_local $push1=, $0=, $pop0{{$}}
+; CHECK-NEXT: call use_2@FUNCTION, $pop1, $0{{$}}
+; CHECK-NEXT: return{{$}}
+declare void @use_2(i32, i32)
+define void @multiple_uses_in_same_insn(i32 %x, i32 %y) {
+  %mul = mul i32 %y, %x
+  call void @use_2(i32 %mul, i32 %mul)
+  ret void
+}
+
+; Commute operands to achieve better stackifying.
+
+; CHECK-LABEL: commute:
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.call $push0=, red@FUNCTION{{$}}
+; CHECK-NEXT: i32.call $push1=, green@FUNCTION{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop0, $pop1{{$}}
+; CHECK-NEXT: i32.call $push3=, blue@FUNCTION{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: return $pop4{{$}}
+declare i32 @red()
+declare i32 @green()
+declare i32 @blue()
+define i32 @commute() {
+  %call = call i32 @red()
+  %call1 = call i32 @green()
+  %add = add i32 %call1, %call
+  %call2 = call i32 @blue()
+  %add3 = add i32 %add, %call2
+  ret i32 %add3
+}
+
 !0 = !{}

test/CodeGen/WebAssembly/varargs.ll
@@ -49,12 +49,13 @@ entry:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
 ; CHECK-NEXT: .local i32{{$}}
-; CHECK-NEXT: i32.load $1=, 0($0){{$}}
-; CHECK-NEXT: i32.const $push0=, 4{{$}}
-; CHECK-NEXT: i32.add $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.store $discard=, 0($0), $pop1{{$}}
-; CHECK-NEXT: i32.load $push2=, 0($1){{$}}
-; CHECK-NEXT: return $pop2{{$}}
+; CHECK-NEXT: i32.load $push0=, 0($0){{$}}
+; CHECK-NEXT: tee_local $push4=, $1=, $pop0{{$}}
+; CHECK-NEXT: i32.const $push1=, 4{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop4, $pop1{{$}}
+; CHECK-NEXT: i32.store $discard=, 0($0), $pop2{{$}}
+; CHECK-NEXT: i32.load $push3=, 0($1){{$}}
+; CHECK-NEXT: return $pop3{{$}}
 define i8 @arg_i8(i8** %ap) {
 entry:
   %t = va_arg i8** %ap, i8

@@ -71,12 +72,13 @@ entry:
 ; CHECK-NEXT: i32.const $push1=, 3{{$}}
 ; CHECK-NEXT: i32.add $push2=, $pop0, $pop1{{$}}
 ; CHECK-NEXT: i32.const $push3=, -4{{$}}
-; CHECK-NEXT: i32.and $1=, $pop2, $pop3{{$}}
-; CHECK-NEXT: i32.const $push4=, 4{{$}}
-; CHECK-NEXT: i32.add $push5=, $1, $pop4{{$}}
-; CHECK-NEXT: i32.store $discard=, 0($0), $pop5{{$}}
-; CHECK-NEXT: i32.load $push6=, 0($1){{$}}
-; CHECK-NEXT: return $pop6{{$}}
+; CHECK-NEXT: i32.and $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: tee_local $push8=, $1=, $pop4{{$}}
+; CHECK-NEXT: i32.const $push5=, 4{{$}}
+; CHECK-NEXT: i32.add $push6=, $pop8, $pop5{{$}}
+; CHECK-NEXT: i32.store $discard=, 0($0), $pop6{{$}}
+; CHECK-NEXT: i32.load $push7=, 0($1){{$}}
+; CHECK-NEXT: return $pop7{{$}}
 define i32 @arg_i32(i8** %ap) {
 entry:
   %t = va_arg i8** %ap, i32