From a097bd08a15422797ef8487b23bfe5614fcbcbbb Mon Sep 17 00:00:00 2001 From: Luke Drummond Date: Wed, 15 Jan 2020 14:14:01 +0000 Subject: [PATCH] [tablegen] Emit string literals instead of char arrays This changes the generated (Instr|Asm|Reg|Regclass)Name tables from this form: extern const char HexagonInstrNameData[] = { /* 0 */ 'G', '_', 'F', 'L', 'O', 'G', '1', '0', 0, /* 9 */ 'E', 'N', 'D', 'L', 'O', 'O', 'P', '0', 0, /* 18 */ 'V', '6', '_', 'v', 'd', 'd', '0', 0, /* 26 */ 'P', 'S', '_', 'v', 'd', 'd', '0', 0, [...] }; ...to this: extern const char HexagonInstrNameData[] = { /* 0 */ "G_FLOG10\0" /* 9 */ "ENDLOOP0\0" /* 18 */ "V6_vdd0\0" /* 26 */ "PS_vdd0\0" [...] }; This should make debugging and exploration a lot easier for mortals, while providing a significant compile-time reduction for common compilers. To avoid issues with low implementation limits, this is disabled by default for Visual Studio. To force output one way or the other, pass `--long-string-literals=0` (char arrays) or `--long-string-literals=1` (string literals) to `tablegen`. Reviewers: mstorsjo, rnk Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D73044 A variation of this patch was originally committed in ce23515f5ab011 and then reverted in e464b31c due to build failures. 
--- cmake/modules/TableGen.cmake | 8 ++ utils/TableGen/AsmWriterEmitter.cpp | 9 +-- utils/TableGen/InstrInfoEmitter.cpp | 5 +- utils/TableGen/RegisterInfoEmitter.cpp | 10 +-- utils/TableGen/SequenceToOffsetTable.h | 100 +++++++++++++++++++++---- utils/TableGen/TableGen.cpp | 6 ++ 6 files changed, 110 insertions(+), 28 deletions(-) diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake index 9d2fcd9a793..632f69aa338 100644 --- a/cmake/modules/TableGen.cmake +++ b/cmake/modules/TableGen.cmake @@ -58,6 +58,14 @@ function(tablegen project ofn) endif() endif() + # MSVC can't support long string literals ("long" > 65534 bytes)[1], so if there's + # a possibility of generated tables being consumed by MSVC, generate arrays of + # char literals, instead. If we're cross-compiling, then conservatively assume + # that the source might be consumed by MSVC. + # [1] https://docs.microsoft.com/en-us/cpp/cpp/compiler-limits?view=vs-2017 + if (MSVC AND project STREQUAL LLVM) + list(APPEND LLVM_TABLEGEN_FLAGS "--long-string-literals=0") + endif() if (CMAKE_GENERATOR MATCHES "Visual Studio") # Visual Studio has problems with llvm-tblgen's native --write-if-changed # behavior. Since it doesn't do restat optimizations anyway, just don't diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp index 58c0d32d44e..c65a8ea390b 100644 --- a/utils/TableGen/AsmWriterEmitter.cpp +++ b/utils/TableGen/AsmWriterEmitter.cpp @@ -380,9 +380,7 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) { } // Emit the string table itself. - O << " static const char AsmStrs[] = {\n"; - StringTable.emit(O, printChar); - O << " };\n\n"; + StringTable.emitStringLiteralDef(O, " static const char AsmStrs[]"); // Emit the lookup tables in pieces to minimize wasted bytes. 
unsigned BytesNeeded = ((OpcodeInfoBits - BitsLeft) + 7) / 8; @@ -537,9 +535,8 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName, } StringTable.layout(); - O << " static const char AsmStrs" << AltName << "[] = {\n"; - StringTable.emit(O, printChar); - O << " };\n\n"; + StringTable.emitStringLiteralDef(O, Twine(" static const char AsmStrs") + + AltName + "[]"); O << " static const " << getMinimalTypeForRange(StringTable.size() - 1, 32) << " RegAsmOffset" << AltName << "[] = {"; diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp index 6ab58bd26a2..36d9e6697ce 100644 --- a/utils/TableGen/InstrInfoEmitter.cpp +++ b/utils/TableGen/InstrInfoEmitter.cpp @@ -569,9 +569,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) { // Emit the array of instruction names. InstrNames.layout(); - OS << "extern const char " << TargetName << "InstrNameData[] = {\n"; - InstrNames.emit(OS, printChar); - OS << "};\n\n"; + InstrNames.emitStringLiteralDef(OS, Twine("extern const char ") + TargetName + + "InstrNameData[]"); OS << "extern const unsigned " << TargetName <<"InstrNameIndices[] = {"; Num = 0; diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp index 2586ec671b2..bf4ebca3365 100644 --- a/utils/TableGen/RegisterInfoEmitter.cpp +++ b/utils/TableGen/RegisterInfoEmitter.cpp @@ -992,9 +992,8 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, // Emit the string table. 
RegStrings.layout(); - OS << "extern const char " << TargetName << "RegStrings[] = {\n"; - RegStrings.emit(OS, printChar); - OS << "};\n\n"; + RegStrings.emitStringLiteralDef(OS, Twine("extern const char ") + TargetName + + "RegStrings[]"); OS << "extern const MCRegisterDesc " << TargetName << "RegDesc[] = { // Descriptors\n"; @@ -1065,9 +1064,8 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, OS << "} // end anonymous namespace\n\n"; RegClassStrings.layout(); - OS << "extern const char " << TargetName << "RegClassStrings[] = {\n"; - RegClassStrings.emit(OS, printChar); - OS << "};\n\n"; + RegClassStrings.emitStringLiteralDef( + OS, Twine("extern const char ") + TargetName + "RegClassStrings[]"); OS << "extern const MCRegisterClass " << TargetName << "MCRegisterClasses[] = {\n"; diff --git a/utils/TableGen/SequenceToOffsetTable.h b/utils/TableGen/SequenceToOffsetTable.h index 327da39f477..41cdefdb194 100644 --- a/utils/TableGen/SequenceToOffsetTable.h +++ b/utils/TableGen/SequenceToOffsetTable.h @@ -15,6 +15,7 @@ #ifndef LLVM_UTILS_TABLEGEN_SEQUENCETOOFFSETTABLE_H #define LLVM_UTILS_TABLEGEN_SEQUENCETOOFFSETTABLE_H +#include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -23,6 +24,61 @@ #include namespace llvm { +extern llvm::cl::opt EmitLongStrLiterals; + +// Helper function for SequenceToOffsetTable. 
+static inline void printStrLitEscChar(raw_ostream &OS, char C) { + const char *Escapes[] = { + "\\000", "\\001", "\\002", "\\003", "\\004", "\\005", "\\006", "\\007", + "\\010", "\\t", "\\n", "\\013", "\\014", "\\r", "\\016", "\\017", + "\\020", "\\021", "\\022", "\\023", "\\024", "\\025", "\\026", "\\027", + "\\030", "\\031", "\\032", "\\033", "\\034", "\\035", "\\036", "\\037", + " ", "!", "\\\"", "#", "$", "%", "&", "'", + "(", ")", "*", "+", ",", "-", ".", "/", + "0", "1", "2", "3", "4", "5", "6", "7", + "8", "9", ":", ";", "<", "=", ">", "?", + "@", "A", "B", "C", "D", "E", "F", "G", + "H", "I", "J", "K", "L", "M", "N", "O", + "P", "Q", "R", "S", "T", "U", "V", "W", + "X", "Y", "Z", "[", "\\\\", "]", "^", "_", + "`", "a", "b", "c", "d", "e", "f", "g", + "h", "i", "j", "k", "l", "m", "n", "o", + "p", "q", "r", "s", "t", "u", "v", "w", + "x", "y", "z", "{", "|", "}", "~", "\\177", + "\\200", "\\201", "\\202", "\\203", "\\204", "\\205", "\\206", "\\207", + "\\210", "\\211", "\\212", "\\213", "\\214", "\\215", "\\216", "\\217", + "\\220", "\\221", "\\222", "\\223", "\\224", "\\225", "\\226", "\\227", + "\\230", "\\231", "\\232", "\\233", "\\234", "\\235", "\\236", "\\237", + "\\240", "\\241", "\\242", "\\243", "\\244", "\\245", "\\246", "\\247", + "\\250", "\\251", "\\252", "\\253", "\\254", "\\255", "\\256", "\\257", + "\\260", "\\261", "\\262", "\\263", "\\264", "\\265", "\\266", "\\267", + "\\270", "\\271", "\\272", "\\273", "\\274", "\\275", "\\276", "\\277", + "\\300", "\\301", "\\302", "\\303", "\\304", "\\305", "\\306", "\\307", + "\\310", "\\311", "\\312", "\\313", "\\314", "\\315", "\\316", "\\317", + "\\320", "\\321", "\\322", "\\323", "\\324", "\\325", "\\326", "\\327", + "\\330", "\\331", "\\332", "\\333", "\\334", "\\335", "\\336", "\\337", + "\\340", "\\341", "\\342", "\\343", "\\344", "\\345", "\\346", "\\347", + "\\350", "\\351", "\\352", "\\353", "\\354", "\\355", "\\356", "\\357", + "\\360", "\\361", "\\362", "\\363", "\\364", "\\365", "\\366", 
"\\367", + "\\370", "\\371", "\\372", "\\373", "\\374", "\\375", "\\376", "\\377"}; + + static_assert(sizeof Escapes / sizeof Escapes[0] == + std::numeric_limits::max() + 1, + "unsupported character type"); + OS << Escapes[static_cast(C)]; +} + +static inline void printChar(raw_ostream &OS, char C) { + unsigned char UC(C); + if (isalnum(UC) || ispunct(UC)) { + OS << '\''; + if (C == '\\' || C == '\'') + OS << '\\'; + OS << C << '\''; + } else { + OS << unsigned(UC); + } +} /// SequenceToOffsetTable - Collect a number of terminated sequences of T. /// Compute the layout of a table that contains all the sequences, possibly by @@ -108,6 +164,37 @@ public: return I->second + (I->first.size() - Seq.size()); } + /// `emitStringLiteralDef` - Print out the table as the body of an array + /// initializer, where each element is a C string literal terminated by + /// `\0`. Falls back to emitting a comma-separated integer list if + /// `EmitLongStrLiterals` is false + void emitStringLiteralDef(raw_ostream &OS, const llvm::Twine &Decl) const { + assert(Entries && "Call layout() before emitStringLiteralDef()"); + if (EmitLongStrLiterals) { + OS << "\n#ifdef __GNUC__\n" + << "#pragma GCC diagnostic push\n" + << "#pragma GCC diagnostic ignored \"-Woverlength-strings\"\n" + << "#endif\n" + << Decl << " = {\n"; + } else { + OS << Decl << " = {\n"; + emit(OS, printChar, "0"); + OS << "\n};\n\n"; + return; + } + for (auto I : Seqs) { + OS << " /* " << I.second << " */ \""; + for (auto C : I.first) { + printStrLitEscChar(OS, C); + } + OS << "\\0\"\n"; + } + OS << "};\n" + << "#ifdef __GNUC__\n" + << "#pragma GCC diagnostic pop\n" + << "#endif\n\n"; + } + /// emit - Print out the table as the body of an array initializer. /// Use the Print function to print elements. void emit(raw_ostream &OS, @@ -127,19 +214,6 @@ public: } }; -// Helper function for SequenceToOffsetTable. 
-static inline void printChar(raw_ostream &OS, char C) { - unsigned char UC(C); - if (isalnum(UC) || ispunct(UC)) { - OS << '\''; - if (C == '\\' || C == '\'') - OS << '\\'; - OS << C << '\''; - } else { - OS << unsigned(UC); - } -} - } // end namespace llvm #endif diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp index bdb963c15d3..6da6599eb54 100644 --- a/utils/TableGen/TableGen.cpp +++ b/utils/TableGen/TableGen.cpp @@ -60,6 +60,12 @@ namespace llvm { /// Storage for TimeRegionsOpt as a global so that backends aren't required to /// include CommandLine.h bool TimeRegions = false; +cl::opt EmitLongStrLiterals( + "long-string-literals", + cl::desc("when emitting large string tables, prefer string literals over " + "comma-separated char literals. This can be a readability and " + "compile-time performance win, but upsets some compilers"), + cl::Hidden, cl::init(true)); } // end namespace llvm namespace {