From 2e8acd83fdc586af7a15cf3c2d4a7a86ca8acd84 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Thu, 28 Jan 2016 01:22:44 +0000 Subject: [PATCH] [WebAssembly] Enhanced register stackification This patch revamps the RegStackifier pass with a new tree traversal mechanism, enabling three major new features: - Stackification of values with multiple uses, using the result value of set_local - More aggressive stackification of instructions with side effects - Reordering operands in commutative instructions to enable more stackification. llvm-svn: 259009 --- include/llvm/CodeGen/MachineInstr.h | 8 + lib/Target/WebAssembly/README.txt | 20 +- .../WebAssembly/WebAssemblyInstrInfo.cpp | 16 + lib/Target/WebAssembly/WebAssemblyInstrInfo.h | 3 + .../WebAssembly/WebAssemblyInstrInfo.td | 16 +- .../WebAssembly/WebAssemblyRegStackify.cpp | 383 +++++++++++++++--- test/CodeGen/WebAssembly/cfg-stackify.ll | 16 +- test/CodeGen/WebAssembly/reg-stackify.ll | 109 ++++- test/CodeGen/WebAssembly/varargs.ll | 26 +- 9 files changed, 497 insertions(+), 100 deletions(-) diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index 05c9a9e0b07..f3100ccf633 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -343,6 +343,14 @@ public: return make_range(operands_begin() + getDesc().getNumDefs(), operands_end()); } + iterator_range explicit_uses() { + return make_range(operands_begin() + getDesc().getNumDefs(), + operands_begin() + getNumExplicitOperands() ); + } + iterator_range explicit_uses() const { + return make_range(operands_begin() + getDesc().getNumDefs(), + operands_begin() + getNumExplicitOperands() ); + } /// Returns the number of the operand iterator \p I points to. 
unsigned getOperandNo(const_mop_iterator I) const { diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt index 5415d25c3dd..f4417e930c8 100644 --- a/lib/Target/WebAssembly/README.txt +++ b/lib/Target/WebAssembly/README.txt @@ -32,12 +32,6 @@ Interesting work that remains to be done: //===---------------------------------------------------------------------===// -set_local instructions have a return value. We should (a) model this, -and (b) write optimizations which take advantage of it. Keep in mind that -many set_local instructions are implicit! - -//===---------------------------------------------------------------------===// - Br, br_if, and tableswitch instructions can support having a value on the expression stack across the jump (sometimes). We should (a) model this, and (b) extend the stackifier to utilize it. @@ -87,3 +81,17 @@ Find a clean way to fix the problem which leads to the Shrink Wrapping pass being run after the WebAssembly PEI pass. //===---------------------------------------------------------------------===// + +When setting multiple local variables to the same constant, we currently get +code like this: + + i32.const $4=, 0 + i32.const $3=, 0 + +It could be done with a smaller encoding like this: + + i32.const $push5=, 0 + tee_local $push6=, $4=, $pop5 + copy_local $3=, $pop6 + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 37dd0c4026a..00aca688af4 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -15,6 +15,7 @@ #include "WebAssemblyInstrInfo.h" #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblySubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -75,6 +76,21 @@ void 
WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, KillSrc ? RegState::Kill : 0); } +MachineInstr * +WebAssemblyInstrInfo::commuteInstructionImpl(MachineInstr *MI, bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { + // If the operands are stackified, we can't reorder them. + WebAssemblyFunctionInfo &MFI = + *MI->getParent()->getParent()->getInfo(); + if (MFI.isVRegStackified(MI->getOperand(OpIdx1).getReg()) || + MFI.isVRegStackified(MI->getOperand(OpIdx2).getReg())) + return nullptr; + + // Otherwise use the default implementation. + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); +} + // Branch analysis. bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index bf0c1dc6c95..8da99ac755d 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -40,6 +40,9 @@ public: void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; + MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index f761606f2e8..60e5d6ea645 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -107,15 +107,8 @@ defm : ARGUMENT; let Defs = [ARGUMENTS] in { // get_local and set_local are not generated by instruction selection; they -// are implied by virtual register uses and defs in most contexts. However, -// they are explicitly emitted for special purposes. +// are implied by virtual register uses and defs. 
multiclass LOCAL { - def GET_LOCAL_#vt : I<(outs vt:$res), (ins i32imm:$regno), [], - "get_local\t$res, $regno">; - // TODO: set_local returns its operand value - def SET_LOCAL_#vt : I<(outs), (ins i32imm:$regno, vt:$src), [], - "set_local\t$regno, $src">; - // COPY_LOCAL is not an actual instruction in wasm, but since we allow // get_local and set_local to be implicit, we can have a COPY_LOCAL which // is actually a no-op because all the work is done in the implied @@ -123,6 +116,13 @@ multiclass LOCAL { let isAsCheapAsAMove = 1 in def COPY_LOCAL_#vt : I<(outs vt:$res), (ins vt:$src), [], "copy_local\t$res, $src">; + + // TEE_LOCAL is similar to COPY_LOCAL, but writes two copies of its result. + // Typically this would be used to stackify one result and write the other + // result to a local. + let isAsCheapAsAMove = 1 in + def TEE_LOCAL_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), [], + "tee_local\t$res, $also, $src">; } defm : LOCAL; defm : LOCAL; diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index df9e6de5ffc..3377ca0979d 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -27,6 +27,8 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" @@ -44,12 +46,13 @@ class WebAssemblyRegStackify final : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); - AU.addPreservedID(MachineDominatorsID); AU.addPreservedID(LiveVariablesID); + AU.addPreserved(); 
MachineFunctionPass::getAnalysisUsage(AU); } @@ -89,8 +92,8 @@ static void ImposeStackOrdering(MachineInstr *MI) { // TODO: Compute memory dependencies in a way that uses AliasAnalysis to be // more precise. static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, - AliasAnalysis &AA, LiveIntervals &LIS, - MachineRegisterInfo &MRI) { + AliasAnalysis &AA, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { assert(Def->getParent() == Insert->getParent()); bool SawStore = false, SawSideEffects = false; MachineBasicBlock::const_iterator D(Def), I(Insert); @@ -133,6 +136,250 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, !(SawSideEffects && !Def->isSafeToMove(&AA, SawStore)); } +/// Test whether OneUse, a use of Reg, dominates all of Reg's other uses. +static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse, + const MachineBasicBlock &MBB, + const MachineRegisterInfo &MRI, + const MachineDominatorTree &MDT) { + for (const MachineOperand &Use : MRI.use_operands(Reg)) { + if (&Use == &OneUse) + continue; + const MachineInstr *UseInst = Use.getParent(); + const MachineInstr *OneUseInst = OneUse.getParent(); + if (UseInst->getOpcode() == TargetOpcode::PHI) { + // Test that the PHI use, which happens on the CFG edge rather than + // within the PHI's own block, is dominated by the one selected use. + const MachineBasicBlock *Pred = + UseInst->getOperand(&Use - &UseInst->getOperand(0) + 1).getMBB(); + if (!MDT.dominates(&MBB, Pred)) + return false; + } else if (UseInst == OneUseInst) { + // Another use in the same instruction. We need to ensure that the one + // selected use happens "before" it. + if (&OneUse > &Use) + return false; + } else { + // Test that the use is dominated by the one selected use. + if (!MDT.dominates(OneUseInst, UseInst)) + return false; + } + } + return true; +} + +/// Get the appropriate tee_local opcode for the given register class. 
+static unsigned GetTeeLocalOpcode(const TargetRegisterClass *RC) { + if (RC == &WebAssembly::I32RegClass) + return WebAssembly::TEE_LOCAL_I32; + if (RC == &WebAssembly::I64RegClass) + return WebAssembly::TEE_LOCAL_I64; + if (RC == &WebAssembly::F32RegClass) + return WebAssembly::TEE_LOCAL_F32; + if (RC == &WebAssembly::F64RegClass) + return WebAssembly::TEE_LOCAL_F64; + llvm_unreachable("Unexpected register class"); +} + +/// A single-use def in the same block with no intervening memory or register +/// dependencies; move the def down and nest it with the current instruction. +static MachineInstr *MoveForSingleUse(unsigned Reg, MachineInstr *Def, + MachineBasicBlock &MBB, + MachineInstr *Insert, LiveIntervals &LIS, + WebAssemblyFunctionInfo &MFI) { + MBB.splice(Insert, &MBB, Def); + LIS.handleMove(Def); + MFI.stackifyVReg(Reg); + ImposeStackOrdering(Def); + return Def; +} + +/// A trivially cloneable instruction; clone it and nest the new copy with the +/// current instruction. +static MachineInstr * +RematerializeCheapDef(unsigned Reg, MachineOperand &Op, MachineInstr *Def, + MachineBasicBlock &MBB, MachineInstr *Insert, + LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI, + MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII, + const WebAssemblyRegisterInfo *TRI) { + unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI); + Op.setReg(NewReg); + MachineInstr *Clone = &*std::prev(MachineBasicBlock::instr_iterator(Insert)); + LIS.InsertMachineInstrInMaps(Clone); + LIS.createAndComputeVirtRegInterval(NewReg); + MFI.stackifyVReg(NewReg); + ImposeStackOrdering(Clone); + + // If that was the last use of the original, delete the original. + // Otherwise shrink the LiveInterval. 
+ if (MRI.use_empty(Reg)) { + SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot(); + LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx); + LIS.removeVRegDefAt(LIS.getInterval(Reg), Idx); + LIS.removeInterval(Reg); + LIS.RemoveMachineInstrFromMaps(Def); + Def->eraseFromParent(); + } else { + LIS.shrinkToUses(&LIS.getInterval(Reg)); + } + return Clone; +} + +/// A multiple-use def in the same block with no intervening memory or register +/// dependencies; move the def down, nest it with the current instruction, and +/// insert a tee_local to satisfy the rest of the uses. As an illustration, +/// rewrite this: +/// +/// Reg = INST ... // Def +/// INST ..., Reg, ... // Insert +/// INST ..., Reg, ... +/// INST ..., Reg, ... +/// +/// to this: +/// +/// Reg = INST ... // Def (to become the new Insert) +/// TeeReg, NewReg = TEE_LOCAL_... Reg +/// INST ..., TeeReg, ... // Insert +/// INST ..., NewReg, ... +/// INST ..., NewReg, ... +/// +/// with Reg and TeeReg stackified. This eliminates a get_local from the +/// resulting code. 
+static MachineInstr *MoveAndTeeForMultiUse( + unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB, + MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI, + MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) { + MBB.splice(Insert, &MBB, Def); + LIS.handleMove(Def); + const auto *RegClass = MRI.getRegClass(Reg); + unsigned NewReg = MRI.createVirtualRegister(RegClass); + unsigned TeeReg = MRI.createVirtualRegister(RegClass); + MRI.replaceRegWith(Reg, NewReg); + MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(), + TII->get(GetTeeLocalOpcode(RegClass)), TeeReg) + .addReg(NewReg, RegState::Define) + .addReg(Reg); + Op.setReg(TeeReg); + Def->getOperand(0).setReg(Reg); + LIS.InsertMachineInstrInMaps(Tee); + LIS.shrinkToUses(&LIS.getInterval(Reg)); + LIS.createAndComputeVirtRegInterval(NewReg); + LIS.createAndComputeVirtRegInterval(TeeReg); + MFI.stackifyVReg(Reg); + MFI.stackifyVReg(TeeReg); + ImposeStackOrdering(Def); + ImposeStackOrdering(Tee); + return Def; +} + +namespace { +/// A stack for walking the tree of instructions being built, visiting the +/// MachineOperands in DFS order. 
+class TreeWalkerState { + typedef MachineInstr::mop_iterator mop_iterator; + typedef std::reverse_iterator mop_reverse_iterator; + typedef iterator_range RangeTy; + SmallVector Worklist; + +public: + explicit TreeWalkerState(MachineInstr *Insert) { + const iterator_range &Range = Insert->explicit_uses(); + if (Range.begin() != Range.end()) + Worklist.push_back(reverse(Range)); + } + + bool Done() const { return Worklist.empty(); } + + MachineOperand &Pop() { + RangeTy &Range = Worklist.back(); + MachineOperand &Op = *Range.begin(); + Range = drop_begin(Range, 1); + if (Range.begin() == Range.end()) + Worklist.pop_back(); + assert((Worklist.empty() || + Worklist.back().begin() != Worklist.back().end()) && + "Empty ranges shouldn't remain in the worklist"); + return Op; + } + + /// Push Instr's operands onto the stack to be visited. + void PushOperands(MachineInstr *Instr) { + const iterator_range &Range(Instr->explicit_uses()); + if (Range.begin() != Range.end()) + Worklist.push_back(reverse(Range)); + } + + /// Some of Instr's operands are on the top of the stack; remove them and + /// re-insert them starting from the beginning (because we've commuted them). + void ResetTopOperands(MachineInstr *Instr) { + assert(HasRemainingOperands(Instr) && + "Reseting operands should only be done when the instruction has " + "an operand still on the stack"); + Worklist.back() = reverse(Instr->explicit_uses()); + } + + /// Test whether Instr has operands remaining to be visited at the top of + /// the stack. + bool HasRemainingOperands(const MachineInstr *Instr) const { + if (Worklist.empty()) + return false; + const RangeTy &Range = Worklist.back(); + return Range.begin() != Range.end() && Range.begin()->getParent() == Instr; + } +}; + +/// State to keep track of whether commuting is in flight or whether it's been +/// tried for the current instruction and didn't work. 
+class CommutingState {
+  /// There are effectively three states: the initial state where we haven't
+  /// started commuting anything and we don't know anything yet, the tentative
+  /// state where we've commuted the operands of the current instruction and are
+  /// revisiting it, and the declined state where we've reverted the operands
+  /// back to their original order and will no longer commute it further.
+  bool TentativelyCommuting;
+  bool Declined;
+
+  /// During the tentative state, these hold the operand indices of the commuted
+  /// operands.
+  unsigned Operand0, Operand1;
+
+public:
+  CommutingState() : TentativelyCommuting(false), Declined(false) {}
+
+  /// Stackification for an operand was not successful due to ordering
+  /// constraints. If possible, and if we haven't already tried it and declined
+  /// it, commute Insert's operands and prepare to revisit it.
+  void MaybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
+                    const WebAssemblyInstrInfo *TII) {
+    if (TentativelyCommuting) {
+      assert(!Declined &&
+             "Don't decline commuting until you've finished trying it");
+      // Commuting didn't help. Revert it.
+      TII->commuteInstruction(Insert, /*NewMI=*/false, Operand0, Operand1);
+      TentativelyCommuting = false;
+      Declined = true;
+    } else if (!Declined && TreeWalker.HasRemainingOperands(Insert)) {
+      Operand0 = TargetInstrInfo::CommuteAnyOperandIndex;
+      Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
+      if (TII->findCommutedOpIndices(Insert, Operand0, Operand1)) {
+        // Tentatively commute the operands and try again.
+        TII->commuteInstruction(Insert, /*NewMI=*/false, Operand0, Operand1);
+        TreeWalker.ResetTopOperands(Insert);
+        TentativelyCommuting = true;
+        Declined = false;
+      }
+    }
+  }
+
+  /// Stackification for some operand was successful. Reset to the default
+  /// state.
+ void Reset() { + TentativelyCommuting = false; + Declined = false; + } +}; +} // end anonymous namespace + bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** Register Stackifying **********\n" "********** Function: " @@ -144,6 +391,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { const auto *TII = MF.getSubtarget().getInstrInfo(); const auto *TRI = MF.getSubtarget().getRegisterInfo(); AliasAnalysis &AA = getAnalysis().getAAResults(); + MachineDominatorTree &MDT = getAnalysis(); LiveIntervals &LIS = getAnalysis(); // Walk the instructions from the bottom up. Currently we don't look past @@ -165,20 +413,36 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { // Iterate through the inputs in reverse order, since we'll be pulling // operands off the stack in LIFO order. - bool AnyStackified = false; - for (MachineOperand &Op : reverse(Insert->uses())) { + CommutingState Commuting; + TreeWalkerState TreeWalker(Insert); + while (!TreeWalker.Done()) { + MachineOperand &Op = TreeWalker.Pop(); + // We're only interested in explicit virtual register operands. - if (!Op.isReg() || Op.isImplicit() || !Op.isUse()) + if (!Op.isReg()) continue; unsigned Reg = Op.getReg(); - - // Only consider registers with a single definition. - // TODO: Eventually we may relax this, to stackify phi transfers. - MachineInstr *Def = MRI.getUniqueVRegDef(Reg); - if (!Def) + assert(Op.isUse() && "explicit_uses() should only iterate over uses"); + assert(!Op.isImplicit() && + "explicit_uses() should only iterate over explicit operands"); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + // Identify the definition for this register at this point. Most + // registers are in SSA form here so we try a quick MRI query first. + MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (!Def) { + // MRI doesn't know what the Def is. Try asking LIS. 
+ const VNInfo *ValNo = LIS.getInterval(Reg).getVNInfoBefore( + LIS.getInstructionIndex(Insert)); + if (!ValNo) + continue; + Def = LIS.getInstructionFromIndex(ValNo->def); + if (!Def) + continue; + } + // Don't nest an INLINE_ASM def into anything, because we don't have // constraints for $pop outputs. if (Def->getOpcode() == TargetOpcode::INLINEASM) @@ -196,59 +460,52 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { Def->getOpcode() == WebAssembly::ARGUMENT_F64) continue; - if (MRI.hasOneUse(Reg) && Def->getParent() == &MBB && - IsSafeToMove(Def, Insert, AA, LIS, MRI)) { - // A single-use def in the same block with no intervening memory or - // register dependencies; move the def down and nest it with the - // current instruction. - // TODO: Stackify multiple-use values, taking advantage of set_local - // returning its result. - Changed = true; - AnyStackified = true; - MBB.splice(Insert, &MBB, Def); - LIS.handleMove(Def); - MFI.stackifyVReg(Reg); - ImposeStackOrdering(Def); - Insert = Def; + // Decide which strategy to take. Prefer to move a single-use value + // over cloning it, and prefer cloning over introducing a tee_local. + // For moving, we require the def to be in the same block as the use; + // this makes things simpler (LiveIntervals' handleMove function only + // supports intra-block moves) and it's MachineSink's job to catch all + // the sinking opportunities anyway. + bool SameBlock = Def->getParent() == &MBB; + bool CanMove = SameBlock && IsSafeToMove(Def, Insert, AA, LIS, MRI); + if (CanMove && MRI.hasOneUse(Reg)) { + Insert = MoveForSingleUse(Reg, Def, MBB, Insert, LIS, MFI); } else if (Def->isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def, &AA)) { - // A trivially cloneable instruction; clone it and nest the new copy - // with the current instruction. 
- Changed = true; - AnyStackified = true; - unsigned OldReg = Def->getOperand(0).getReg(); - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); - TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI); - Op.setReg(NewReg); - MachineInstr *Clone = - &*std::prev(MachineBasicBlock::instr_iterator(Insert)); - LIS.InsertMachineInstrInMaps(Clone); - LIS.createAndComputeVirtRegInterval(NewReg); - MFI.stackifyVReg(NewReg); - ImposeStackOrdering(Clone); - Insert = Clone; - - // If that was the last use of the original, delete the original. - // Otherwise shrink the LiveInterval. - if (MRI.use_empty(OldReg)) { - SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot(); - LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx); - LIS.removeVRegDefAt(LIS.getInterval(OldReg), Idx); - LIS.removeInterval(OldReg); - LIS.RemoveMachineInstrFromMaps(Def); - Def->eraseFromParent(); - } else { - LIS.shrinkToUses(&LIS.getInterval(OldReg)); - } + Insert = RematerializeCheapDef(Reg, Op, Def, MBB, Insert, LIS, MFI, + MRI, TII, TRI); + } else if (CanMove && + OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT)) { + Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI, + MRI, TII); + } else { + // We failed to stackify the operand. If the problem was ordering + // constraints, Commuting may be able to help. + if (!CanMove && SameBlock) + Commuting.MaybeCommute(Insert, TreeWalker, TII); + // Proceed to the next operand. + continue; } + + // We stackified an operand. Add the defining instruction's operands to + // the worklist stack now to continue to build an ever deeper tree. + Commuting.Reset(); + TreeWalker.PushOperands(Insert); } - if (AnyStackified) + + // If we stackified any operands, skip over the tree to start looking for + // the next instruction we can build a tree on. 
+ if (Insert != &*MII) { ImposeStackOrdering(&*MII); + MII = std::prev( + make_reverse_iterator(MachineBasicBlock::iterator(Insert))); + Changed = true; + } } } - // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere - // so that it never looks like a use-before-def. + // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere so + // that it never looks like a use-before-def. if (Changed) { MF.getRegInfo().addLiveIn(WebAssembly::EXPR_STACK); for (MachineBasicBlock &MBB : MF) @@ -263,23 +520,25 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { for (MachineOperand &MO : reverse(MI.explicit_operands())) { if (!MO.isReg()) continue; - unsigned VReg = MO.getReg(); + unsigned Reg = MO.getReg(); // Don't stackify physregs like SP or FP. - if (!TargetRegisterInfo::isVirtualRegister(VReg)) + if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; - if (MFI.isVRegStackified(VReg)) { + if (MFI.isVRegStackified(Reg)) { if (MO.isDef()) - Stack.push_back(VReg); + Stack.push_back(Reg); else - assert(Stack.pop_back_val() == VReg); + assert(Stack.pop_back_val() == Reg && + "Register stack pop should be paired with a push"); } } } // TODO: Generalize this code to support keeping values on the stack across // basic block boundaries. 
- assert(Stack.empty()); + assert(Stack.empty() && + "Register stack pushes and pops should be balanced"); } #endif diff --git a/test/CodeGen/WebAssembly/cfg-stackify.ll b/test/CodeGen/WebAssembly/cfg-stackify.ll index a1ff4ee30e0..24844422fd1 100644 --- a/test/CodeGen/WebAssembly/cfg-stackify.ll +++ b/test/CodeGen/WebAssembly/cfg-stackify.ll @@ -979,7 +979,7 @@ bb6: ; OPT-LABEL: test11: ; OPT: block{{$}} ; OPT-NEXT: block{{$}} -; OPT: br_if $0, 0{{$}} +; OPT: br_if $pop{{[0-9]+}}, 0{{$}} ; OPT-NEXT: block{{$}} ; OPT-NOT: block ; OPT: br_if $0, 0{{$}} @@ -1121,31 +1121,33 @@ bb7: ; CHECK-LABEL: test13: ; CHECK-NEXT: .local i32{{$}} ; CHECK: block{{$}} -; CHECK: br_if $pop4, 0{{$}} +; CHECK: br_if $pop5, 0{{$}} ; CHECK-NEXT: return{{$}} ; CHECK-NEXT: .LBB22_2: ; CHECK-NEXT: end_block{{$}} ; CHECK: block{{$}} -; CHECK-NEXT: br_if $0, 0{{$}} +; CHECK-NEXT: i32.const $push3=, 0{{$}} +; CHECK-NEXT: br_if $pop3, 0{{$}} ; CHECK: .LBB22_4: ; CHECK-NEXT: end_block{{$}} ; CHECK: block{{$}} -; CHECK: br_if $pop6, 0{{$}} +; CHECK: br_if $pop7, 0{{$}} ; CHECK-NEXT: end_block{{$}} ; CHECK-NEXT: unreachable{{$}} ; OPT-LABEL: test13: ; OPT-NEXT: .local i32{{$}} ; OPT: block{{$}} -; OPT: br_if $pop4, 0{{$}} +; OPT: br_if $pop5, 0{{$}} ; OPT-NEXT: return{{$}} ; OPT-NEXT: .LBB22_2: ; OPT-NEXT: end_block{{$}} ; OPT: block{{$}} -; OPT-NEXT: br_if $0, 0{{$}} +; OPT-NEXT: i32.const $push3=, 0{{$}} +; OPT-NEXT: br_if $pop3, 0{{$}} ; OPT: .LBB22_4: ; OPT-NEXT: end_block{{$}} ; OPT: block{{$}} -; OPT: br_if $pop6, 0{{$}} +; OPT: br_if $pop7, 0{{$}} ; OPT-NEXT: end_block{{$}} ; OPT-NEXT: unreachable{{$}} define void @test13() noinline optnone { diff --git a/test/CodeGen/WebAssembly/reg-stackify.ll b/test/CodeGen/WebAssembly/reg-stackify.ll index 3b700e94e2b..0083b410626 100644 --- a/test/CodeGen/WebAssembly/reg-stackify.ll +++ b/test/CodeGen/WebAssembly/reg-stackify.ll @@ -90,17 +90,18 @@ false: } ; Test an interesting case where the load has multiple uses and cannot -; be trivially 
stackified. +; be trivially stackified. However, it can be stackified with a tee_local. ; CHECK-LABEL: multiple_uses: ; CHECK-NEXT: .param i32, i32, i32{{$}} ; CHECK-NEXT: .local i32{{$}} -; CHECK-NEXT: i32.load $3=, 0($2){{$}} ; CHECK-NEXT: block{{$}} -; CHECK-NEXT: i32.ge_u $push0=, $3, $1{{$}} -; CHECK-NEXT: br_if $pop0, 0{{$}} -; CHECK-NEXT: i32.lt_u $push1=, $3, $0{{$}} +; CHECK-NEXT: i32.load $push0=, 0($2){{$}} +; CHECK-NEXT: tee_local $push3=, $3=, $pop0{{$}} +; CHECK-NEXT: i32.ge_u $push1=, $pop3, $1{{$}} ; CHECK-NEXT: br_if $pop1, 0{{$}} +; CHECK-NEXT: i32.lt_u $push2=, $3, $0{{$}} +; CHECK-NEXT: br_if $pop2, 0{{$}} ; CHECK-NEXT: i32.store $discard=, 0($2), $3{{$}} ; CHECK-NEXT: .LBB5_3: ; CHECK-NEXT: end_block{{$}} @@ -145,4 +146,102 @@ entry: ret void } +; Div instructions have side effects and can't be reordered, but this entire +; function should still be able to be stackified because it's already in +; tree order. + +; CHECK-LABEL: div_tree: +; CHECK-NEXT: .param i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32{{$}} +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: i32.div_s $push0=, $0, $1 +; CHECK-NEXT: i32.div_s $push1=, $2, $3 +; CHECK-NEXT: i32.div_s $push2=, $pop0, $pop1 +; CHECK-NEXT: i32.div_s $push3=, $4, $5 +; CHECK-NEXT: i32.div_s $push4=, $6, $7 +; CHECK-NEXT: i32.div_s $push5=, $pop3, $pop4 +; CHECK-NEXT: i32.div_s $push6=, $pop2, $pop5 +; CHECK-NEXT: i32.div_s $push7=, $8, $9 +; CHECK-NEXT: i32.div_s $push8=, $10, $11 +; CHECK-NEXT: i32.div_s $push9=, $pop7, $pop8 +; CHECK-NEXT: i32.div_s $push10=, $12, $13 +; CHECK-NEXT: i32.div_s $push11=, $14, $15 +; CHECK-NEXT: i32.div_s $push12=, $pop10, $pop11 +; CHECK-NEXT: i32.div_s $push13=, $pop9, $pop12 +; CHECK-NEXT: i32.div_s $push14=, $pop6, $pop13 +; CHECK-NEXT: return $pop14 +define i32 @div_tree(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) { +entry: + %div = sdiv i32 %a, %b + 
%div1 = sdiv i32 %c, %d + %div2 = sdiv i32 %div, %div1 + %div3 = sdiv i32 %e, %f + %div4 = sdiv i32 %g, %h + %div5 = sdiv i32 %div3, %div4 + %div6 = sdiv i32 %div2, %div5 + %div7 = sdiv i32 %i, %j + %div8 = sdiv i32 %k, %l + %div9 = sdiv i32 %div7, %div8 + %div10 = sdiv i32 %m, %n + %div11 = sdiv i32 %o, %p + %div12 = sdiv i32 %div10, %div11 + %div13 = sdiv i32 %div9, %div12 + %div14 = sdiv i32 %div6, %div13 + ret i32 %div14 +} + +; A simple multiple-use case. + +; CHECK-LABEL: simple_multiple_use: +; CHECK-NEXT: .param i32, i32{{$}} +; CHECK-NEXT: i32.mul $push0=, $1, $0{{$}} +; CHECK-NEXT: tee_local $push1=, $0=, $pop0{{$}} +; CHECK-NEXT: call use_a@FUNCTION, $pop1{{$}} +; CHECK-NEXT: call use_b@FUNCTION, $0{{$}} +; CHECK-NEXT: return{{$}} +declare void @use_a(i32) +declare void @use_b(i32) +define void @simple_multiple_use(i32 %x, i32 %y) { + %mul = mul i32 %y, %x + call void @use_a(i32 %mul) + call void @use_b(i32 %mul) + ret void +} + +; Multiple uses of the same value in one instruction. + +; CHECK-LABEL: multiple_uses_in_same_insn: +; CHECK-NEXT: .param i32, i32{{$}} +; CHECK-NEXT: i32.mul $push0=, $1, $0{{$}} +; CHECK-NEXT: tee_local $push1=, $0=, $pop0{{$}} +; CHECK-NEXT: call use_2@FUNCTION, $pop1, $0{{$}} +; CHECK-NEXT: return{{$}} +declare void @use_2(i32, i32) +define void @multiple_uses_in_same_insn(i32 %x, i32 %y) { + %mul = mul i32 %y, %x + call void @use_2(i32 %mul, i32 %mul) + ret void +} + +; Commute operands to achieve better stackifying. 
+ +; CHECK-LABEL: commute: +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: i32.call $push0=, red@FUNCTION{{$}} +; CHECK-NEXT: i32.call $push1=, green@FUNCTION{{$}} +; CHECK-NEXT: i32.add $push2=, $pop0, $pop1{{$}} +; CHECK-NEXT: i32.call $push3=, blue@FUNCTION{{$}} +; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}} +; CHECK-NEXT: return $pop4{{$}} +declare i32 @red() +declare i32 @green() +declare i32 @blue() +define i32 @commute() { + %call = call i32 @red() + %call1 = call i32 @green() + %add = add i32 %call1, %call + %call2 = call i32 @blue() + %add3 = add i32 %add, %call2 + ret i32 %add3 +} + !0 = !{} diff --git a/test/CodeGen/WebAssembly/varargs.ll b/test/CodeGen/WebAssembly/varargs.ll index bece022823b..c5061e825a6 100644 --- a/test/CodeGen/WebAssembly/varargs.ll +++ b/test/CodeGen/WebAssembly/varargs.ll @@ -49,12 +49,13 @@ entry: ; CHECK-NEXT: .param i32{{$}} ; CHECK-NEXT: .result i32{{$}} ; CHECK-NEXT: .local i32{{$}} -; CHECK-NEXT: i32.load $1=, 0($0){{$}} -; CHECK-NEXT: i32.const $push0=, 4{{$}} -; CHECK-NEXT: i32.add $push1=, $1, $pop0{{$}} -; CHECK-NEXT: i32.store $discard=, 0($0), $pop1{{$}} -; CHECK-NEXT: i32.load $push2=, 0($1){{$}} -; CHECK-NEXT: return $pop2{{$}} +; CHECK-NEXT: i32.load $push0=, 0($0){{$}} +; CHECK-NEXT: tee_local $push4=, $1=, $pop0{{$}} +; CHECK-NEXT: i32.const $push1=, 4{{$}} +; CHECK-NEXT: i32.add $push2=, $pop4, $pop1{{$}} +; CHECK-NEXT: i32.store $discard=, 0($0), $pop2{{$}} +; CHECK-NEXT: i32.load $push3=, 0($1){{$}} +; CHECK-NEXT: return $pop3{{$}} define i8 @arg_i8(i8** %ap) { entry: %t = va_arg i8** %ap, i8 @@ -71,12 +72,13 @@ entry: ; CHECK-NEXT: i32.const $push1=, 3{{$}} ; CHECK-NEXT: i32.add $push2=, $pop0, $pop1{{$}} ; CHECK-NEXT: i32.const $push3=, -4{{$}} -; CHECK-NEXT: i32.and $1=, $pop2, $pop3{{$}} -; CHECK-NEXT: i32.const $push4=, 4{{$}} -; CHECK-NEXT: i32.add $push5=, $1, $pop4{{$}} -; CHECK-NEXT: i32.store $discard=, 0($0), $pop5{{$}} -; CHECK-NEXT: i32.load $push6=, 0($1){{$}} -; CHECK-NEXT: return $pop6{{$}} +; 
CHECK-NEXT: i32.and $push4=, $pop2, $pop3{{$}} +; CHECK-NEXT: tee_local $push8=, $1=, $pop4{{$}} +; CHECK-NEXT: i32.const $push5=, 4{{$}} +; CHECK-NEXT: i32.add $push6=, $pop8, $pop5{{$}} +; CHECK-NEXT: i32.store $discard=, 0($0), $pop6{{$}} +; CHECK-NEXT: i32.load $push7=, 0($1){{$}} +; CHECK-NEXT: return $pop7{{$}} define i32 @arg_i32(i8** %ap) { entry: %t = va_arg i8** %ap, i32