[ms] [llvm-ml] Add initial MASM STRUCT/UNION support

Summary: Add support for user-defined types to MasmParser, including initialization and field access.

Known issues:
- Omitted entry initializers (e.g., <,0>) do not work consistently for nested structs/arrays.
- Size checking/inference for values with known types is not yet implemented.
- Some ml64.exe syntaxes for accessing STRUCT fields are not recognized:
  - `[<register>.<struct name>].<field>`
  - `[<register>[<struct name>.<field>]]`
  - `(<struct name> PTR [<register>]).<field>`
  - `[<variable>.<struct name>].<field>`
  - `(<struct name> PTR <variable>).<field>`

Reviewed By: thakis

Differential Revision: https://reviews.llvm.org/D75306
2024-11-22 02:33:06 +01:00 · 2020-07-07 17:01:10 -04:00 · 2020-07-07 17:01:10 -04:00 · 164df8d6f0
commit 164df8d6f0
parent 5c6ba2c07d
6 changed files with 1649 additions and 236 deletions
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@ -170,6 +170,11 @@ public:

  virtual bool isParsingMasm() const { return false; }

+  virtual bool LookUpFieldOffset(StringRef Base, StringRef Member,
+                                 unsigned &Offset) {
+    return true; // Default: no lookup available; true signals failure.
+  }
+
  /// Parse MS-style inline assembly.
  virtual bool parseMSInlineAsm(
      void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
--- a/include/llvm/MC/MCParser/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCParser/MCTargetAsmParser.h
@ -334,7 +334,7 @@ protected: // Can only create subclasses.

  /// SemaCallback - The Sema callback implementation.  Must be set when parsing
  /// ms-style inline assembly.
-  MCAsmParserSemaCallback *SemaCallback;
+  MCAsmParserSemaCallback *SemaCallback = nullptr;

  /// Set of options which affects instrumentation of inline assembly.
  MCTargetOptions MCOptions;
--- a/lib/MC/MCParser/MasmParser.cpp
+++ b/lib/MC/MCParser/MasmParser.cpp
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@ -864,6 +864,8 @@ private:
    return nullptr;
  }

+  bool MatchRegisterByName(unsigned &RegNo, StringRef RegName, SMLoc StartLoc,
+                           SMLoc EndLoc);
  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc,
                     bool RestoreOnFailure);

@ -1145,6 +1147,108 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
  return checkScale(Scale, ErrMsg);
 }

+bool X86AsmParser::MatchRegisterByName(unsigned &RegNo, StringRef RegName,
+                                       SMLoc StartLoc, SMLoc EndLoc) {
+  // Resolve RegName to a register number in RegNo; returns true on failure.
+  // A leading '%' is optional (unprefixed names occur in cfi directives).
+  RegName.consume_front("%");
+
+  RegNo = MatchRegisterName(RegName);
+
+  // A result of 0 means no match; retry with the name lowercased.
+  if (RegNo == 0)
+    RegNo = MatchRegisterName(RegName.lower());
+
+  // The "flags" and "mxcsr" registers cannot be referenced directly in MS
+  // inline asm with Intel syntax; clear the match so they parse as identifiers.
+  if (isParsingMSInlineAsm() && isParsingIntelSyntax() &&
+      (RegNo == X86::EFLAGS || RegNo == X86::MXCSR))
+    RegNo = 0;
+
+  if (!is64BitMode()) { // Reject registers that exist only in 64-bit mode.
+    // FIXME: This should be done using Requires<Not64BitMode> and
+    // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
+    // checked.
+    // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
+    // REX prefix.
+    if (RegNo == X86::RIZ || RegNo == X86::RIP ||
+        X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
+        X86II::isX86_64NonExtLowByteReg(RegNo) ||
+        X86II::isX86_64ExtendedReg(RegNo)) {
+      return Error(StartLoc,
+                   "register %" + RegName + " is only available in 64-bit mode",
+                   SMRange(StartLoc, EndLoc));
+    }
+  }
+
+  // If nothing matched and the name is "db[0-15]", treat it as an alias for
+  // dr[0-15]. The name is tested as written here (it is not lowercased).
+  if (RegNo == 0 && RegName.startswith("db")) {
+    if (RegName.size() == 3) { // db0-db9
+      switch (RegName[2]) {
+      case '0':
+        RegNo = X86::DR0;
+        break;
+      case '1':
+        RegNo = X86::DR1;
+        break;
+      case '2':
+        RegNo = X86::DR2;
+        break;
+      case '3':
+        RegNo = X86::DR3;
+        break;
+      case '4':
+        RegNo = X86::DR4;
+        break;
+      case '5':
+        RegNo = X86::DR5;
+        break;
+      case '6':
+        RegNo = X86::DR6;
+        break;
+      case '7':
+        RegNo = X86::DR7;
+        break;
+      case '8':
+        RegNo = X86::DR8;
+        break;
+      case '9':
+        RegNo = X86::DR9;
+        break;
+      }
+    } else if (RegName.size() == 4 && RegName[2] == '1') { // db10-db15
+      switch (RegName[3]) {
+      case '0':
+        RegNo = X86::DR10;
+        break;
+      case '1':
+        RegNo = X86::DR11;
+        break;
+      case '2':
+        RegNo = X86::DR12;
+        break;
+      case '3':
+        RegNo = X86::DR13;
+        break;
+      case '4':
+        RegNo = X86::DR14;
+        break;
+      case '5':
+        RegNo = X86::DR15;
+        break;
+      }
+    }
+  }
+
+  if (RegNo == 0) {
+    if (isParsingIntelSyntax())
+      return true; // Intel syntax: fail without emitting a diagnostic.
+    return Error(StartLoc, "invalid register name", SMRange(StartLoc, EndLoc));
+  }
+  return false; // Success: RegNo holds the matched register.
+}
+
 bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
                                 SMLoc &EndLoc, bool RestoreOnFailure) {
  MCAsmParser &Parser = getParser();
@ -1180,37 +1284,9 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
                 SMRange(StartLoc, EndLoc));
  }

-  RegNo = MatchRegisterName(Tok.getString());
-
-  // If the match failed, try the register name as lowercase.
-  if (RegNo == 0)
-    RegNo = MatchRegisterName(Tok.getString().lower());
-
-  // The "flags" and "mxcsr" registers cannot be referenced directly.
-  // Treat it as an identifier instead.
-  if (isParsingMSInlineAsm() && isParsingIntelSyntax() &&
-      (RegNo == X86::EFLAGS || RegNo == X86::MXCSR))
-    RegNo = 0;
-
-  if (!is64BitMode()) {
-    // FIXME: This should be done using Requires<Not64BitMode> and
-    // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
-    // checked.
-    // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
-    // REX prefix.
-    if (RegNo == X86::RIZ || RegNo == X86::RIP ||
-        X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
-        X86II::isX86_64NonExtLowByteReg(RegNo) ||
-        X86II::isX86_64ExtendedReg(RegNo)) {
-      StringRef RegName = Tok.getString();
-      OnFailure();
-      if (!RestoreOnFailure) {
-        Parser.Lex(); // Eat register name.
-      }
-      return Error(StartLoc,
-                   "register %" + RegName + " is only available in 64-bit mode",
-                   SMRange(StartLoc, EndLoc));
-    }
+  if (MatchRegisterByName(RegNo, Tok.getString(), StartLoc, EndLoc)) {
+    OnFailure();
+    return true;
  }

  // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
@ -1259,40 +1335,6 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,

  EndLoc = Parser.getTok().getEndLoc();

-  // If this is "db[0-15]", match it as an alias
-  // for dr[0-15].
-  if (RegNo == 0 && Tok.getString().startswith("db")) {
-    if (Tok.getString().size() == 3) {
-      switch (Tok.getString()[2]) {
-      case '0': RegNo = X86::DR0; break;
-      case '1': RegNo = X86::DR1; break;
-      case '2': RegNo = X86::DR2; break;
-      case '3': RegNo = X86::DR3; break;
-      case '4': RegNo = X86::DR4; break;
-      case '5': RegNo = X86::DR5; break;
-      case '6': RegNo = X86::DR6; break;
-      case '7': RegNo = X86::DR7; break;
-      case '8': RegNo = X86::DR8; break;
-      case '9': RegNo = X86::DR9; break;
-      }
-    } else if (Tok.getString().size() == 4 && Tok.getString()[2] == '1') {
-      switch (Tok.getString()[3]) {
-      case '0': RegNo = X86::DR10; break;
-      case '1': RegNo = X86::DR11; break;
-      case '2': RegNo = X86::DR12; break;
-      case '3': RegNo = X86::DR13; break;
-      case '4': RegNo = X86::DR14; break;
-      case '5': RegNo = X86::DR15; break;
-      }
-    }
-
-    if (RegNo != 0) {
-      EndLoc = Parser.getTok().getEndLoc();
-      Parser.Lex(); // Eat it.
-      return false;
-    }
-  }
-
  if (RegNo == 0) {
    OnFailure();
    if (isParsingIntelSyntax()) return true;
@ -1590,12 +1632,41 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
      SMLoc IdentLoc = Tok.getLoc();
      StringRef Identifier = Tok.getString();
      UpdateLocLex = false;
-      // Register
+      // Register, or (MASM only) <register>.<field>
      unsigned Reg;
-      if (Tok.is(AsmToken::Identifier) && !ParseRegister(Reg, IdentLoc, End)) {
-        if (SM.onRegister(Reg, ErrMsg))
-          return Error(Tok.getLoc(), ErrMsg);
-        break;
+      if (Tok.is(AsmToken::Identifier)) {
+        if (!ParseRegister(Reg, IdentLoc, End, /*RestoreOnFailure=*/true)) {
+          if (SM.onRegister(Reg, ErrMsg))
+            return Error(IdentLoc, ErrMsg);
+          break;
+        }
+        if (Parser.isParsingMasm()) {
+          const std::pair<StringRef, StringRef> RegField =
+              Tok.getString().split('.');
+          const StringRef RegName = RegField.first, Field = RegField.second;
+          SMLoc RegEndLoc =
+              SMLoc::getFromPointer(RegName.data() + RegName.size());
+          if (!Field.empty() &&
+              !MatchRegisterByName(Reg, RegName, IdentLoc, RegEndLoc)) {
+            if (SM.onRegister(Reg, ErrMsg))
+              return Error(IdentLoc, ErrMsg);
+
+            SMLoc FieldStartLoc = SMLoc::getFromPointer(Field.data());
+            const std::pair<StringRef, StringRef> BaseMember = Field.split('.');
+            const StringRef Base = BaseMember.first, Member = BaseMember.second;
+
+            unsigned Offset;
+            if (Parser.LookUpFieldOffset(Base, Member, Offset))
+              return Error(FieldStartLoc, "unknown offset");
+            else if (SM.onPlus(ErrMsg))
+              return Error(getTok().getLoc(), ErrMsg);
+            else if (SM.onInteger(Offset, ErrMsg))
+              return Error(IdentLoc, ErrMsg);
+
+            End = consumeToken();
+            break;
+          }
+        }
      }
      // Operator synonymous ("not", "or" etc.)
      bool ParseError = false;
@ -1607,37 +1678,39 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
      // Symbol reference, when parsing assembly content
      InlineAsmIdentifierInfo Info;
      const MCExpr *Val;
-      if (!isParsingMSInlineAsm()) {
-        if (getParser().parsePrimaryExpr(Val, End)) {
-          return Error(Tok.getLoc(), "Unexpected identifier!");
-        } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) {
-          return Error(IdentLoc, ErrMsg);
-        } else
+      if (isParsingMSInlineAsm() || Parser.isParsingMasm()) {
+        // MS Dot Operator expression
+        if (Identifier.count('.') && PrevTK == AsmToken::RBrac) {
+          if (ParseIntelDotOperator(SM, End))
+            return true;
          break;
+        }
      }
-      // MS InlineAsm operators (TYPE/LENGTH/SIZE)
-      if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) {
-        if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
-          if (SM.onInteger(Val, ErrMsg))
-            return Error(IdentLoc, ErrMsg);
-        } else
+      if (isParsingMSInlineAsm()) {
+        // MS InlineAsm operators (TYPE/LENGTH/SIZE)
+        if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) {
+          if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
+            if (SM.onInteger(Val, ErrMsg))
+              return Error(IdentLoc, ErrMsg);
+          } else
+            return true;
+          break;
+        }
+        // MS InlineAsm identifier
+        // Call parseIdentifier() to combine @ with the identifier behind it.
+        if (TK == AsmToken::At && Parser.parseIdentifier(Identifier))
+          return Error(IdentLoc, "expected identifier");
+        if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
          return true;
+        else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
+          return Error(IdentLoc, ErrMsg);
        break;
      }
-      // MS Dot Operator expression
-      if (Identifier.count('.') && PrevTK == AsmToken::RBrac) {
-        if (ParseIntelDotOperator(SM, End))
-          return true;
-        break;
-      }
-      // MS InlineAsm identifier
-      // Call parseIdentifier() to combine @ with the identifier behind it.
-      if (TK == AsmToken::At && Parser.parseIdentifier(Identifier))
-        return Error(IdentLoc, "expected identifier");
-      if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
-        return true;
-      else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
+      if (getParser().parsePrimaryExpr(Val, End)) {
+        return Error(Tok.getLoc(), "Unexpected identifier!");
+      } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) {
        return Error(IdentLoc, ErrMsg);
+      }
      break;
    }
    case AsmToken::Integer: {
@ -1856,10 +1929,14 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End)
    APInt DotDisp;
    DotDispStr.getAsInteger(10, DotDisp);
    Offset = DotDisp.getZExtValue();
-  } else if (isParsingMSInlineAsm() && Tok.is(AsmToken::Identifier)) {
-    std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
-    if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
-                                           Offset))
+  } else if ((isParsingMSInlineAsm() || getParser().isParsingMasm()) &&
+             Tok.is(AsmToken::Identifier)) {
+    const std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+    const StringRef Base = BaseMember.first, Member = BaseMember.second;
+    if (getParser().LookUpFieldOffset(SM.getSymName(), DotDispStr, Offset) &&
+        getParser().LookUpFieldOffset(Base, Member, Offset) &&
+        (!SemaCallback ||
+         SemaCallback->LookupInlineAsmField(Base, Member, Offset)))
      return Error(Tok.getLoc(), "Unable to lookup field reference!");
  } else
    return Error(Tok.getLoc(), "Unexpected token type!");
--- a/test/tools/llvm-ml/struct.test
+++ b/test/tools/llvm-ml/struct.test
@ -0,0 +1,104 @@
+# RUN: llvm-ml -filetype=asm %s | FileCheck %s
+
+.data
+BAZ STRUCT
+  a BYTE 1
+  b BYTE 2
+BAZ ENDS
+
+FOOBAR struct 2
+  c BYTE 3 DUP (4)
+  d DWORD 5
+  e BAZ <>
+  STRUCT f
+    g BYTE 6
+    h BYTE 7
+  ends
+  h BYTE "abcde"
+foobar ENDS
+
+t1 foobar <>
+
+; CHECK: t1:
+;
+; BYTE 3 DUP (4), plus alignment padding
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .zero 1
+;
+; DWORD 5
+; CHECK-NEXT: .long 5
+;
+; BAZ <>
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 2
+;
+; <BYTE 6, BYTE 7>, with internal alignment padding
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .zero 1
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .zero 1
+;
+; BYTE "abcde", plus alignment padding
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NEXT: .byte 101
+; CHECK-NEXT: .zero 1
+
+t2 FOOBAR <"gh",,<10,11>,<12>,"ijk">
+
+; CHECK: t2:
+;
+; BYTE "gh", padded with " ", plus alignment padding
+; CHECK-NEXT: .byte 103
+; CHECK-NEXT: .byte 104
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .zero 1
+;
+; DWORD 5 (default-initialized when omitted)
+; CHECK-NEXT: .long 5
+;
+; BAZ <10, 11>
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .byte 11
+;
+; STRUCT f initialized with <12>: g = 12, h = 7 (default), with internal alignment padding
+; CHECK-NEXT: .byte 12
+; CHECK-NEXT: .zero 1
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .zero 1
+;
+; BYTE "ijk", padded with " ", plus alignment padding
+; CHECK-NEXT: .byte 105
+; CHECK-NEXT: .byte 106
+; CHECK-NEXT: .byte 107
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .zero 1
+
+.code
+
+t3:
+mov eax, t2.f.h
+mov eax, [t2].f.h
+mov eax, [t2.f.h]
+mov eax, t2.FOOBAR.f.h
+
+; CHECK: t3:
+; CHECK-NEXT: mov eax, dword ptr [rip + t2+12]
+; CHECK-NEXT: mov eax, dword ptr [rip + t2+12]
+; CHECK-NEXT: mov eax, dword ptr [rip + t2+12]
+; CHECK-NEXT: mov eax, dword ptr [rip + t2+12]
+
+t4:
+mov eax, j.FOOBAR.f.h
+mov eax, j.baz.b
+
+; CHECK: t4:
+; CHECK-NEXT: mov eax, dword ptr [rip + j+12]
+; CHECK-NEXT: mov eax, dword ptr [rip + j+1]
+
+END
--- a/test/tools/llvm-ml/struct_errors.test
+++ b/test/tools/llvm-ml/struct_errors.test
@ -0,0 +1,57 @@
+# RUN: not llvm-ml -filetype=asm %s 2>&1 | FileCheck %s --dump-input=always
+
+.data
+int_test STRUCT
+  int_arr DWORD ?, ?
+  int_scalar DWORD ?
+int_test ENDS
+
+t1 int_test <<1,2,3>>
+// CHECK: error: Initializer too long for field; expected at most 2 elements, got 3
+
+t2 int_test <4>
+// CHECK: error: Cannot initialize array field with scalar value
+
+t3 int_test <,<5,6>>
+// CHECK: error: Cannot initialize scalar field with array value
+
+real_test STRUCT
+  real_arr REAL4 ?, ?, ?
+  real_scalar REAL4 ?
+real_test ENDS
+
+t4 real_test <<1.0,0.0,-1.0,-2.0>>
+// CHECK: error: Initializer too long for field; expected at most 3 elements, got 4
+
+t5 real_test <2.0>
+// CHECK: error: Cannot initialize array field with scalar value
+
+t6 real_test <,<2.0,-2.0>>
+// CHECK: error: Cannot initialize scalar field with array value
+
+inner_struct STRUCT
+  a BYTE ?
+inner_struct ENDS
+
+struct_test STRUCT
+  struct_arr inner_struct 4 DUP (?)
+  struct_scalar inner_struct ?
+struct_test ENDS
+
+t7 struct_test <<<>, <>, <>, <>, <>>>
+// CHECK: error: Initializer too long for field; expected at most 4 elements, got 5
+
+t8 struct_test <,<<>, <>>>
+// CHECK: error: 'inner_struct' initializer initializes too many fields
+
+t9 STRUCT 3
+// CHECK: error: alignment must be a power of two; was 3
+t9 ENDS
+
+t10 STRUCT 1, X
+// CHECK: error: Unrecognized qualifier for 'STRUCT' directive; expected none or NONUNIQUE
+t10 ENDS
+
+t11 STRUCT
+different_struct ENDS
+// CHECK: error: mismatched name in ENDS directive; expected 't11'