//===- DFAEmitter.cpp - Finite state automaton emitter --------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This class can produce a generic deterministic finite state automaton (DFA), // given a set of possible states and transitions. // // The input transitions can be nondeterministic - this class will produce the // deterministic equivalent state machine. // // The generated code can run the DFA and produce an accepted / not accepted // state and also produce, given a sequence of transitions that results in an // accepted state, the sequence of intermediate states. This is useful if the // initial automaton was nondeterministic - it allows mapping back from the DFA // to the NFA. // //===----------------------------------------------------------------------===// #include "DFAEmitter.h" #include "CodeGenTarget.h" #include "SequenceToOffsetTable.h" #include "TableGenBackends.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/UniqueVector.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" #include #include #include #include #include #include #define DEBUG_TYPE "dfa-emitter" using namespace llvm; //===----------------------------------------------------------------------===// // DfaEmitter implementation. This is independent of the GenAutomaton backend. //===----------------------------------------------------------------------===// void DfaEmitter::addTransition(state_type From, state_type To, action_type A) { Actions.insert(A); NfaStates.insert(From); NfaStates.insert(To); NfaTransitions[{From, A}].push_back(To); ++NumNfaTransitions; } void DfaEmitter::visitDfaState(const DfaState &DS) { // For every possible action... auto FromId = DfaStates.idFor(DS); for (action_type A : Actions) { DfaState NewStates; DfaTransitionInfo TI; // For every represented state, word pair in the original NFA... for (state_type FromState : DS) { // If this action is possible from this state add the transitioned-to // states to NewStates. auto I = NfaTransitions.find({FromState, A}); if (I == NfaTransitions.end()) continue; for (state_type &ToState : I->second) { NewStates.push_back(ToState); TI.emplace_back(FromState, ToState); } } if (NewStates.empty()) continue; // Sort and unique. sort(NewStates); NewStates.erase(std::unique(NewStates.begin(), NewStates.end()), NewStates.end()); sort(TI); TI.erase(std::unique(TI.begin(), TI.end()), TI.end()); unsigned ToId = DfaStates.insert(NewStates); DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI)); } } void DfaEmitter::constructDfa() { DfaState Initial(1, /*NFA initial state=*/0); DfaStates.insert(Initial); // Note that UniqueVector starts indices at 1, not zero. unsigned DfaStateId = 1; while (DfaStateId <= DfaStates.size()) { DfaState S = DfaStates[DfaStateId]; visitDfaState(S); DfaStateId++; } } void DfaEmitter::emit(StringRef Name, raw_ostream &OS) { constructDfa(); OS << "// Input NFA has " << NfaStates.size() << " states with " << NumNfaTransitions << " transitions.\n"; OS << "// Generated DFA has " << DfaStates.size() << " states with " << DfaTransitions.size() << " transitions.\n\n"; // Implementation note: We don't bake a simple std::pair<> here as it requires // significantly more effort to parse. A simple test with a large array of // struct-pairs (N=100000) took clang-10 6s to parse. The same array of // std::pair took 242s. Instead we allow the user to // define the pair type. // // FIXME: It may make sense to emit these as ULEB sequences instead of // pairs of uint64_t. OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n"; OS << "// transition implies a set of NFA transitions. These are referred\n"; OS << "// to by index in " << Name << "Transitions[].\n"; SequenceToOffsetTable Table; std::map EmittedIndices; for (auto &T : DfaTransitions) Table.add(T.second.second); Table.layout(); OS << "const std::array " << Name << "TransitionInfo = {{\n"; Table.emit( OS, [](raw_ostream &OS, std::pair P) { OS << "{" << P.first << ", " << P.second << "}"; }, "{0ULL, 0ULL}"); OS << "}};\n\n"; OS << "// A transition in the generated " << Name << " DFA.\n"; OS << "struct " << Name << "Transition {\n"; OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n"; OS << " "; printActionType(OS); OS << " Action; // The input symbol that causes this transition.\n"; OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n"; OS << " unsigned InfoIdx; // Start index into " << Name << "TransitionInfo.\n"; OS << "};\n\n"; OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n"; OS << "// The initial state is 1, not zero.\n"; OS << "const std::array<" << Name << "Transition, " << DfaTransitions.size() << "> " << Name << "Transitions = {{\n"; for (auto &KV : DfaTransitions) { dfa_state_type From = KV.first.first; dfa_state_type To = KV.second.first; action_type A = KV.first.second; unsigned InfoIdx = Table.get(KV.second.second); OS << " {" << From << ", "; printActionValue(A, OS); OS << ", " << To << ", " << InfoIdx << "},\n"; } OS << "\n}};\n\n"; } void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; } void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; } //===----------------------------------------------------------------------===// // AutomatonEmitter implementation //===----------------------------------------------------------------------===// namespace { // FIXME: This entire discriminated union could be removed with c++17: // using Action = std::variant; struct Action { Record *R = nullptr; unsigned I = 0; std::string S; Action() = default; Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {} void print(raw_ostream &OS) const { if (R) OS << R->getName(); else if (!S.empty()) OS << '"' << S << '"'; else OS << I; } bool operator<(const Action &Other) const { return std::make_tuple(R, I, S) < std::make_tuple(Other.R, Other.I, Other.S); } }; using ActionTuple = std::vector; class Automaton; class Transition { uint64_t NewState; // The tuple of actions that causes this transition. ActionTuple Actions; // The types of the actions; this is the same across all transitions. SmallVector Types; public: Transition(Record *R, Automaton *Parent); const ActionTuple &getActions() { return Actions; } SmallVector getTypes() { return Types; } bool canTransitionFrom(uint64_t State); uint64_t transitionFrom(uint64_t State); }; class Automaton { RecordKeeper &Records; Record *R; std::vector Transitions; /// All possible action tuples, uniqued. UniqueVector Actions; /// The fields within each Transition object to find the action symbols. std::vector ActionSymbolFields; public: Automaton(RecordKeeper &Records, Record *R); void emit(raw_ostream &OS); ArrayRef getActionSymbolFields() { return ActionSymbolFields; } /// If the type of action A has been overridden (there exists a field /// "TypeOf_A") return that, otherwise return the empty string. StringRef getActionSymbolType(StringRef A); }; class AutomatonEmitter { RecordKeeper &Records; public: AutomatonEmitter(RecordKeeper &R) : Records(R) {} void run(raw_ostream &OS); }; /// A DfaEmitter implementation that can print our variant action type. class CustomDfaEmitter : public DfaEmitter { const UniqueVector &Actions; std::string TypeName; public: CustomDfaEmitter(const UniqueVector &Actions, StringRef TypeName) : Actions(Actions), TypeName(TypeName) {} void printActionType(raw_ostream &OS) override; void printActionValue(action_type A, raw_ostream &OS) override; }; } // namespace void AutomatonEmitter::run(raw_ostream &OS) { for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) { Automaton A(Records, R); OS << "#ifdef GET_" << R->getName() << "_DECL\n"; A.emit(OS); OS << "#endif // GET_" << R->getName() << "_DECL\n"; } } Automaton::Automaton(RecordKeeper &Records, Record *R) : Records(Records), R(R) { LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n"); ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields"); } void Automaton::emit(raw_ostream &OS) { StringRef TransitionClass = R->getValueAsString("TransitionClass"); for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) { assert(T->isSubClassOf("Transition")); Transitions.emplace_back(T, this); Actions.insert(Transitions.back().getActions()); } LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size() << "\n"); LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size() << " potential transitions.\n"); StringRef Name = R->getName(); CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action"); // Starting from the initial state, build up a list of possible states and // transitions. std::deque Worklist(1, 0); std::set SeenStates; unsigned NumTransitions = 0; SeenStates.insert(Worklist.front()); while (!Worklist.empty()) { uint64_t State = Worklist.front(); Worklist.pop_front(); for (Transition &T : Transitions) { if (!T.canTransitionFrom(State)) continue; uint64_t NewState = T.transitionFrom(State); if (SeenStates.emplace(NewState).second) Worklist.emplace_back(NewState); ++NumTransitions; Emitter.addTransition(State, NewState, Actions.idFor(T.getActions())); } } LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size() << " states with " << NumTransitions << " transitions.\n"); const auto &ActionTypes = Transitions.back().getTypes(); OS << "// The type of an action in the " << Name << " automaton.\n"; if (ActionTypes.size() == 1) { OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n"; } else { OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ") << ">;\n"; } OS << "\n"; Emitter.emit(Name, OS); } StringRef Automaton::getActionSymbolType(StringRef A) { Twine Ty = "TypeOf_" + A; if (!R->getValue(Ty.str())) return ""; return R->getValueAsString(Ty.str()); } Transition::Transition(Record *R, Automaton *Parent) { BitsInit *NewStateInit = R->getValueAsBitsInit("NewState"); NewState = 0; assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 && "State cannot be represented in 64 bits!"); for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) { if (auto *Bit = dyn_cast(NewStateInit->getBit(I))) { if (Bit->getValue()) NewState |= 1ULL << I; } } for (StringRef A : Parent->getActionSymbolFields()) { RecordVal *SymbolV = R->getValue(A); if (auto *Ty = dyn_cast(SymbolV->getType())) { Actions.emplace_back(R->getValueAsDef(A), 0, ""); Types.emplace_back(Ty->getAsString()); } else if (isa(SymbolV->getType())) { Actions.emplace_back(nullptr, R->getValueAsInt(A), ""); Types.emplace_back("unsigned"); } else if (isa(SymbolV->getType())) { Actions.emplace_back(nullptr, 0, std::string(R->getValueAsString(A))); Types.emplace_back("std::string"); } else { report_fatal_error("Unhandled symbol type!"); } StringRef TypeOverride = Parent->getActionSymbolType(A); if (!TypeOverride.empty()) Types.back() = std::string(TypeOverride); } } bool Transition::canTransitionFrom(uint64_t State) { if ((State & NewState) == 0) // The bits we want to set are not set; return true; return false; } uint64_t Transition::transitionFrom(uint64_t State) { return State | NewState; } void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; } void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) { const ActionTuple &AT = Actions[A]; if (AT.size() > 1) OS << "std::make_tuple("; ListSeparator LS; for (const auto &SingleAction : AT) { OS << LS; SingleAction.print(OS); } if (AT.size() > 1) OS << ")"; } namespace llvm { void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) { AutomatonEmitter(RK).run(OS); } } // namespace llvm