[AsmParser][SystemZ][z/OS] Add in support to accept "#" as part of an Identifier token

- This patch adds support for accepting the "#" character as part of an Identifier token.
- This support is needed especially for the HLASM dialect, since "#" is treated as part of the valid "Alphabet" range.
- This is done by following the precedent set by the `AllowAtInIdentifier` field in `MCAsmLexer.h`: a new field called `AllowHashInIdentifier` is introduced.
- The static function `IsIdentifierChar` (renamed to `isIdentifierChar` in this patch) is also updated to accept the `#` character if the `AllowHashInIdentifier` field is set to true.

Note: The field introduced in `MCAsmLexer.h` could very well be moved to `MCAsmInfo.h`. I'm not opposed to it. I decided to put it in `MCAsmLexer` since there seems to be some sort of precedent already with `AllowAtInIdentifier`.

Reviewed By: abhina.sreeskantharajan, nickdesaulniers, MaskRay

Differential Revision: https://reviews.llvm.org/D99277
2024-11-22 02:33:06 +01:00 · 2021-04-01 10:38:42 -04:00 · 2021-04-01 10:38:42 -04:00 · 388899404f
commit 388899404f
parent 70e309f48b
3 changed files with 76 additions and 10 deletions
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@ -48,6 +48,7 @@ protected: // Can only create subclasses.
  const char *TokStart = nullptr;
  bool SkipSpace = true;
  bool AllowAtInIdentifier;
+  bool AllowHashInIdentifier = false;
  bool IsAtStartOfStatement = true;
  bool LexMasmHexFloats = false;
  bool LexMasmIntegers = false;
@ -147,6 +148,8 @@ public:
  bool getAllowAtInIdentifier() { return AllowAtInIdentifier; }
  void setAllowAtInIdentifier(bool v) { AllowAtInIdentifier = v; }

+  void setAllowHashInIdentifier(bool V) { AllowHashInIdentifier = V; }
+
  void setCommentConsumer(AsmCommentConsumer *CommentConsumer) {
    this->CommentConsumer = CommentConsumer;
  }
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@ -143,10 +143,10 @@ AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
  return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
 }

-/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
-static bool IsIdentifierChar(char c, bool AllowAt) {
-  return isAlnum(c) || c == '_' || c == '$' || c == '.' ||
-         (c == '@' && AllowAt) || c == '?';
+/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@#?]*
+static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
+  return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
+         (AllowAt && C == '@') || (AllowHash && C == '#');
 }

 AsmToken AsmLexer::LexIdentifier() {
@ -156,12 +156,13 @@ AsmToken AsmLexer::LexIdentifier() {
    while (isDigit(*CurPtr))
      ++CurPtr;

-    if (!IsIdentifierChar(*CurPtr, AllowAtInIdentifier) ||
+    if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
+                          AllowHashInIdentifier) ||
        *CurPtr == 'e' || *CurPtr == 'E')
      return LexFloatLiteral();
  }

-  while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
+  while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
    ++CurPtr;

  // Handle . as a special case.
@ -726,9 +727,10 @@ AsmToken AsmLexer::LexToken() {
  switch (CurChar) {
  default:
    if (MAI.doesAllowSymbolAtNameStart()) {
-      // Handle Microsoft-style identifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@?]*
+      // Handle Microsoft-style identifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
      if (!isDigit(CurChar) &&
-          IsIdentifierChar(CurChar, MAI.doesAllowAtInName()))
+          isIdentifierChar(CurChar, MAI.doesAllowAtInName(),
+                           AllowHashInIdentifier))
        return LexIdentifier();
    } else {
      // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
--- a/unittests/MC/SystemZ/SystemZAsmLexerTest.cpp
+++ b/unittests/MC/SystemZ/SystemZAsmLexerTest.cpp
@ -94,8 +94,6 @@ protected:
    Str.reset(TheTarget->createNullStreamer(*Ctx));

    Parser.reset(createMCAsmParser(SrcMgr, *Ctx, *Str, *MUPMAI));
-    // Lex initially to get the string.
-    Parser->getLexer().Lex();
  }

  void lexAndCheckTokens(StringRef AsmStr,
@ -116,6 +114,9 @@ TEST_F(SystemZAsmLexerTest, CheckDontRestrictCommentStringToStartOfStatement) {
  // Setup.
  setupCallToAsmParser(AsmStr);

+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
  SmallVector<AsmToken::TokenKind> ExpectedTokens(
      {AsmToken::Identifier, AsmToken::EndOfStatement});
  lexAndCheckTokens(AsmStr /* "jne #-4" */, ExpectedTokens);
@ -129,6 +130,9 @@ TEST_F(SystemZAsmLexerTest, CheckRestrictCommentStringToStartOfStatement) {
  MUPMAI->setRestrictCommentStringToStartOfStatement(true);
  setupCallToAsmParser(AsmStr);

+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
  // When we are restricting the comment string to only the start of the
  // statement, The sequence of tokens we are expecting are: Identifier - "jne"
  // Hash - '#'
@ -148,8 +152,65 @@ TEST_F(SystemZAsmLexerTest, CheckHLASMComment) {
  MUPMAI->setCommentString("*");
  setupCallToAsmParser(AsmStr);

+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
  SmallVector<AsmToken::TokenKind> ExpectedTokens(
      {AsmToken::EndOfStatement, AsmToken::Eof});
  lexAndCheckTokens(AsmStr /* "* lhi 1,10" */, ExpectedTokens);
 }
+
+TEST_F(SystemZAsmLexerTest, CheckHashDefault) {
+  StringRef AsmStr = "lh#123";
+
+  // Setup.
+  setupCallToAsmParser(AsmStr);
+
+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
+  // "lh" -> Identifier
+  // "#123" -> EndOfStatement (Lexed as a comment since CommentString is "#")
+  SmallVector<AsmToken::TokenKind> ExpectedTokens(
+      {AsmToken::Identifier, AsmToken::EndOfStatement, AsmToken::Eof});
+  lexAndCheckTokens(AsmStr, ExpectedTokens);
+}
+
+// Test if "#" is accepted as part of an Identifier
+TEST_F(SystemZAsmLexerTest, CheckAllowHashInIdentifier) {
+  StringRef AsmStr = "lh#123";
+
+  // Setup.
+  setupCallToAsmParser(AsmStr);
+  Parser->getLexer().setAllowHashInIdentifier(true);
+
+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
+  // "lh#123" -> Identifier
+  SmallVector<AsmToken::TokenKind> ExpectedTokens(
+      {AsmToken::Identifier, AsmToken::EndOfStatement, AsmToken::Eof});
+  lexAndCheckTokens(AsmStr, ExpectedTokens);
+}
+
+TEST_F(SystemZAsmLexerTest, CheckAllowHashInIdentifier2) {
+  StringRef AsmStr = "lh#12*3";
+
+  // Setup.
+  MUPMAI->setCommentString("*");
+  MUPMAI->setRestrictCommentStringToStartOfStatement(true);
+  setupCallToAsmParser(AsmStr);
+  Parser->getLexer().setAllowHashInIdentifier(true);
+
+  // Lex initially to get the string.
+  Parser->getLexer().Lex();
+
+  // "lh#12" -> Identifier
+  // "*" -> Star
+  // "3" -> Integer
+  SmallVector<AsmToken::TokenKind> ExpectedTokens(
+      {AsmToken::Identifier, AsmToken::Star, AsmToken::Integer,
+       AsmToken::EndOfStatement, AsmToken::Eof});
+  lexAndCheckTokens(AsmStr, ExpectedTokens);
+}
 } // end anonymous namespace