
[WebAssembly] Model the stack evaluation order more precisely.

We currently don't represent get_local and set_local explicitly; they
are just implied by virtual register use and def. This avoids a lot of
clutter, but it does complicate stackifying: get_locals read their
operands at their position in the stack evaluation order, rather than
at their parent instruction. This patch adds code to walk the stack to
determine the precise ordering, when needed.

llvm-svn: 269854
Dan Gohman 2016-05-17 22:24:18 +00:00
parent 6eb7829d59
commit df1ae0caf4
3 changed files with 73 additions and 32 deletions
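
A note on the mechanism described in the commit message: because get_local and set_local are implicit, a use reads its register at the point where its implicit get_local lands in the stack evaluation order, not at its parent instruction. Deciding whether one use happens after another therefore means following the chain of single-use stackified defs that feeds the selected use, which is what the new loop in OneUseDominatesOtherUses below does. Below is a minimal sketch of that check over an invented toy representation (ToyInst, IsStackified, SoleConsumer, and useHappensAtOrAfter are illustrative names, not LLVM's MachineInstr/MachineRegisterInfo API), offered only to make the walk easier to follow:

// Sketch only (assumed names, not LLVM's API): each instruction may define
// one virtual register; a "stackified" register lives on the wasm value
// stack and has exactly one consumer.
#include <unordered_map>

struct ToyInst {
  int DefReg = -1; // virtual register defined by this instruction, or -1
};

// Return true if Use is evaluated at or after Selected in the stack
// evaluation order, by hopping along single-use stackified defs.
bool useHappensAtOrAfter(
    const ToyInst *Use, const ToyInst *Selected,
    const std::unordered_map<int, bool> &IsStackified,
    const std::unordered_map<int, const ToyInst *> &SoleConsumer) {
  while (Use != Selected) {
    // If Use doesn't produce a stackified value, its implicit get_local is
    // not tied to Selected's operand tree; give up conservatively.
    if (Use->DefReg < 0)
      return false;
    auto It = IsStackified.find(Use->DefReg);
    if (It == IsStackified.end() || !It->second)
      return false;
    Use = SoleConsumer.at(Use->DefReg); // hop to the value's only consumer
  }
  // The real patch additionally compares operand positions once the walk
  // reaches the selected use; that detail is glossed over here.
  return true;
}

In the real pass, the two maps correspond roughly to MFI.isVRegStackified and to following MRI.use_begin on the def's register, as in the code below.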

lib/Target/WebAssembly/WebAssemblyRegStackify.cpp

@@ -163,10 +163,12 @@ static void Query(const MachineInstr *MI, AliasAnalysis &AA,
// of memoperands as having a potential unknown memory reference.
break;
default:
// Record potential stores, unless it's a call, as calls are handled
// Record volatile accesses, unless it's a call, as calls are handled
// specially below.
if (!MI->isCall())
if (!MI->isCall()) {
Write = true;
Effects = true;
}
break;
}
}
@@ -197,24 +199,15 @@ static void Query(const MachineInstr *MI, AliasAnalysis &AA,
if (MI->isCall()) {
switch (MI->getOpcode()) {
case WebAssembly::CALL_VOID:
case WebAssembly::CALL_INDIRECT_VOID:
QueryCallee(MI, 0, Read, Write, Effects, StackPointer);
break;
case WebAssembly::CALL_I32:
case WebAssembly::CALL_I64:
case WebAssembly::CALL_F32:
case WebAssembly::CALL_F64:
case WebAssembly::CALL_I32: case WebAssembly::CALL_I64:
case WebAssembly::CALL_F32: case WebAssembly::CALL_F64:
case WebAssembly::CALL_INDIRECT_I32: case WebAssembly::CALL_INDIRECT_I64:
case WebAssembly::CALL_INDIRECT_F32: case WebAssembly::CALL_INDIRECT_F64:
QueryCallee(MI, 1, Read, Write, Effects, StackPointer);
break;
case WebAssembly::CALL_INDIRECT_VOID:
case WebAssembly::CALL_INDIRECT_I32:
case WebAssembly::CALL_INDIRECT_I64:
case WebAssembly::CALL_INDIRECT_F32:
case WebAssembly::CALL_INDIRECT_F64:
Read = true;
Write = true;
Effects = true;
StackPointer = true;
break;
default:
llvm_unreachable("unexpected call opcode");
}
@@ -360,7 +353,8 @@ static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
const MachineBasicBlock &MBB,
const MachineRegisterInfo &MRI,
const MachineDominatorTree &MDT,
LiveIntervals &LIS) {
LiveIntervals &LIS,
WebAssemblyFunctionInfo &MFI) {
const LiveInterval &LI = LIS.getInterval(Reg);
const MachineInstr *OneUseInst = OneUse.getParent();
@@ -384,8 +378,31 @@ static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
return false;
} else {
// Test that the use is dominated by the one selected use.
if (!MDT.dominates(OneUseInst, UseInst))
return false;
while (!MDT.dominates(OneUseInst, UseInst)) {
// Actually, dominating is over-conservative. Test that the use would
// happen after the one selected use in the stack evaluation order.
//
// This is needed as a consequence of using implicit get_locals for
// uses and implicit set_locals for defs.
if (UseInst->getDesc().getNumDefs() == 0)
return false;
const MachineOperand &MO = UseInst->getOperand(0);
if (!MO.isReg())
return false;
unsigned DefReg = MO.getReg();
if (!TargetRegisterInfo::isVirtualRegister(DefReg) ||
!MFI.isVRegStackified(DefReg))
return false;
assert(MRI.hasOneUse(DefReg));
const MachineOperand &NewUse = *MRI.use_begin(DefReg);
const MachineInstr *NewUseInst = NewUse.getParent();
if (NewUseInst == OneUseInst) {
if (&OneUse > &NewUse)
return false;
break;
}
UseInst = NewUseInst;
}
}
}
return true;
@@ -619,6 +636,9 @@ public:
/// Test whether the given register is present on the stack, indicating an
/// operand in the tree that we haven't visited yet. Moving a definition of
/// Reg to a point in the tree after that would change its value.
///
/// This is needed as a consequence of using implicit get_locals for
/// uses and implicit set_locals for defs.
bool IsOnStack(unsigned Reg) const {
for (const RangeTy &Range : Worklist)
for (const MachineOperand &MO : Range)
@@ -763,7 +783,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
Insert = RematerializeCheapDef(Reg, Op, Def, MBB, Insert, LIS, MFI,
MRI, TII, TRI);
} else if (CanMove &&
OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS)) {
OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
MRI, TII);
} else {
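
For readers skimming the first two hunks of this file: volatile accesses are now recorded as having unknown side effects in addition to writes, and the indirect-call cases are folded into the same callee query used for direct calls rather than being handled by a separate, maximally conservative case. A rough sketch with invented stand-in names (Kind, Effects, queryCallee, query; not the pass's real types), assuming that reading of the diff:

// Sketch only: toy stand-ins for the pass's Query/QueryCallee logic.
enum class Kind { VolatileAccess, CallVoid, CallWithResult, CallIndirect };

struct Effects {
  bool Read = false, Write = false, SideEffects = false, StackPointer = false;
};

// The real QueryCallee inspects the callee operand; here we assume the worst.
static void queryCallee(Effects &E) {
  E.Read = E.Write = E.SideEffects = E.StackPointer = true;
}

static Effects query(Kind K) {
  Effects E;
  switch (K) {
  case Kind::VolatileAccess:
    E.Write = true;       // as before the patch
    E.SideEffects = true; // new: volatile accesses also count as effects
    break;
  case Kind::CallVoid:
  case Kind::CallWithResult:
  case Kind::CallIndirect:
    queryCallee(E); // indirect calls no longer get a separate ad-hoc case
    break;
  }
  return E;
}

The operand index passed to QueryCallee in the real code (0 for void calls, 1 for value-returning ones) is the detail the toy glosses over.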

test/CodeGen/WebAssembly/reg-stackify.ll

@@ -281,15 +281,36 @@ define i32 @commute() {
; an implicit get_local for the register.
; CHECK-LABEL: no_stackify_past_use:
; CHECK: i32.call $1=, callee@FUNCTION, $0
; CHECK-NEXT: i32.const $push0=, 1
; CHECK-NEXT: i32.add $push1=, $0, $pop0
; CHECK-NEXT: i32.call $push2=, callee@FUNCTION, $pop1
; CHECK-NEXT: i32.sub $push3=, $pop2, $1
; CHECK-NEXT: i32.div_s $push4=, $pop3, $1
; CHECK-NEXT: return $pop4
declare i32 @callee(i32)
define i32 @no_stackify_past_use(i32 %arg) {
%tmp1 = call i32 @callee(i32 %arg)
%tmp2 = add i32 %arg, 1
%tmp3 = call i32 @callee(i32 %tmp2)
%tmp5 = sub i32 %tmp3, %tmp1
%tmp6 = sdiv i32 %tmp5, %tmp1
ret i32 %tmp6
}
; This is the same as no_stackify_past_use, except using a commutative operator,
; so we can reorder the operands and stackify.
; CHECK-LABEL: commute_to_fix_ordering:
; CHECK: i32.call $push[[L0:.+]]=, callee@FUNCTION, $0
; CHECK: tee_local $push[[L1:.+]]=, $1=, $pop[[L0]]
; CHECK: i32.const $push0=, 1
; CHECK: i32.add $push1=, $0, $pop0
; CHECK: i32.call $push2=, callee@FUNCTION, $pop1
; CHECK: i32.add $push3=, $1, $pop2
; CHECK: i32.mul $push4=, $1, $pop3
; CHECK: i32.mul $push4=, $pop[[L1]], $pop3
; CHECK: return $pop4
declare i32 @callee(i32)
define i32 @no_stackify_past_use(i32 %arg) {
define i32 @commute_to_fix_ordering(i32 %arg) {
%tmp1 = call i32 @callee(i32 %arg)
%tmp2 = add i32 %arg, 1
%tmp3 = call i32 @callee(i32 %tmp2)

test/CodeGen/WebAssembly/userstack.ll

@@ -56,14 +56,14 @@ define void @allocarray() {
; CHECK-NEXT: i32.load $push[[L5:.+]]=, 0($pop[[L4]])
; CHECK-NEXT: i32.const $push[[L6:.+]]=, 144{{$}}
; CHECK-NEXT: i32.sub $push[[L11:.+]]=, $pop[[L5]], $pop[[L6]]
; CHECK-NEXT: i32.store $[[SP:.+]]=, 0($pop[[L7]]), $pop[[L11]]
; CHECK-NEXT: i32.store ${{.+}}=, 0($pop[[L7]]), $pop[[L11]]
%r = alloca [33 x i32]
; CHECK-NEXT: i32.const $push[[L2:.+]]=, 24
; CHECK-NEXT: i32.add $push[[L3:.+]]=, $[[SP]], $pop[[L2]]
; CHECK: i32.const $push{{.+}}=, 24
; CHECK-NEXT: i32.add $push[[L3:.+]]=, $[[SP]], $pop{{.+}}
; CHECK-NEXT: i32.const $push[[L1:.+]]=, 1{{$}}
; CHECK-NEXT: i32.store $push[[L0:.+]]=, 0($pop[[L3]]), $pop[[L1]]{{$}}
; CHECK-NEXT: i32.store $discard=, 12($[[SP]]), $pop[[L0]]{{$}}
; CHECK-NEXT: i32.store $discard=, 12(${{.+}}), $pop[[L0]]{{$}}
%p = getelementptr [33 x i32], [33 x i32]* %r, i32 0, i32 0
store i32 1, i32* %p
%p2 = getelementptr [33 x i32], [33 x i32]* %r, i32 0, i32 3
@@ -111,20 +111,20 @@ define void @allocarray_inbounds() {
; CHECK-NEXT: i32.load $push[[L4:.+]]=, 0($pop[[L3]])
; CHECK-NEXT: i32.const $push[[L5:.+]]=, 32{{$}}
; CHECK-NEXT: i32.sub $push[[L10:.+]]=, $pop[[L4]], $pop[[L5]]
; CHECK-NEXT: i32.store $[[SP:.+]]=, 0($pop[[L6]]), $pop[[L10]]{{$}}
; CHECK-NEXT: i32.store ${{.+}}=, 0($pop[[L6]]), $pop[[L10]]{{$}}
%r = alloca [5 x i32]
; CHECK: i32.const $push[[L3:.+]]=, 1
; CHECK: i32.store {{.*}}=, 12($[[SP]]), $pop[[L3]]
; CHECK: i32.store {{.*}}=, 12(${{.+}}), $pop[[L3]]
%p = getelementptr inbounds [5 x i32], [5 x i32]* %r, i32 0, i32 0
store i32 1, i32* %p
; This store should have both the GEP and the FI folded into it.
; CHECK-NEXT: i32.store {{.*}}=, 24($[[SP]]), $pop
; CHECK-NEXT: i32.store {{.*}}=, 24(${{.+}}), $pop
%p2 = getelementptr inbounds [5 x i32], [5 x i32]* %r, i32 0, i32 3
store i32 1, i32* %p2
call void @ext_func(i64* null);
; CHECK: i32.const $push[[L6:.+]]=, __stack_pointer
; CHECK-NEXT: i32.const $push[[L5:.+]]=, 32
; CHECK-NEXT: i32.add $push[[L7:.+]]=, $[[SP]], $pop[[L5]]
; CHECK-NEXT: i32.add $push[[L7:.+]]=, ${{.+}}, $pop[[L5]]
; CHECK-NEXT: i32.store $discard=, 0($pop[[L6]]), $pop[[L7]]
ret void
}