This revision was landed with ongoing or failed builds. This revision was automatically updated to reflect the committed changes. hokein marked 2 inline comments as done. Closed by commit rG1a65c491be71: [pseudo] Support parsing variant target symbols. (authored by hokein).
Changed prior to commit: https://reviews.llvm.org/D125006?vs=429645&id=429647#toc Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D125006/new/ https://reviews.llvm.org/D125006 Files: clang-tools-extra/pseudo/benchmarks/Benchmark.cpp clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp clang-tools-extra/pseudo/include/clang-pseudo/GLR.h clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h clang-tools-extra/pseudo/lib/GLR.cpp clang-tools-extra/pseudo/lib/Grammar.cpp clang-tools-extra/pseudo/lib/LRGraph.cpp clang-tools-extra/pseudo/lib/LRTable.cpp clang-tools-extra/pseudo/lib/LRTableBuild.cpp clang-tools-extra/pseudo/lib/cxx.bnf clang-tools-extra/pseudo/test/glr-variant-start.cpp clang-tools-extra/pseudo/tool/ClangPseudo.cpp clang-tools-extra/pseudo/unittests/GLRTest.cpp clang-tools-extra/pseudo/unittests/GrammarTest.cpp
Index: clang-tools-extra/pseudo/unittests/GrammarTest.cpp =================================================================== --- clang-tools-extra/pseudo/unittests/GrammarTest.cpp +++ clang-tools-extra/pseudo/unittests/GrammarTest.cpp @@ -112,7 +112,7 @@ b := a )cpp"); - EXPECT_EQ(G->startSymbol(), id("_")); + EXPECT_EQ(G->underscore(), id("_")); EXPECT_THAT(Diags, UnorderedElementsAre( "Rule '_ := ,_opt' has a nullable RHS", "Rule 'null := ' has a nullable RHS", Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp =================================================================== --- clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -344,7 +344,8 @@ const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions); auto LRTable = LRTable::buildSLR(*G); - const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSStack}); + const ForestNode &Parsed = + glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test")); // Verify that there is no duplicated sequence node of `expr := IDENTIFIER` // in the forest, see the `#1` and `=#1` in the dump string. EXPECT_EQ(Parsed.dumpRecursive(*G), @@ -381,7 +382,8 @@ const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions); auto LRTable = LRTable::buildSLR(*G); - const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSStack}); + const ForestNode &Parsed = + glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test")); EXPECT_EQ(Parsed.dumpRecursive(*G), "[ 0, end) test := <ambiguous>\n" "[ 0, end) ââtest := IDENTIFIER\n" Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp =================================================================== --- clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -43,6 +43,9 @@ desc("Strip directives and select conditional sections")); static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics")); static opt<bool> PrintForest("print-forest", desc("Print parse forest")); +static opt<std::string> StartSymbol("start-symbol", + desc("specify the start symbol to parse"), + init("translation-unit")); static std::string readOrDie(llvm::StringRef Path) { llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text = @@ -110,9 +113,16 @@ if (ParseableStream) { clang::pseudo::ForestArena Arena; clang::pseudo::GSS GSS; - auto &Root = - glrParse(*ParseableStream, - clang::pseudo::ParseParams{*G, LRTable, Arena, GSS}); + llvm::Optional<clang::pseudo::SymbolID> StartSymID = + G->findNonterminal(StartSymbol); + if (!StartSymID) { + llvm::errs() << llvm::formatv( + "The start symbol {0} doesn't exit in the grammar!\n", Grammar); + return 2; + } + auto &Root = glrParse(*ParseableStream, + clang::pseudo::ParseParams{*G, LRTable, Arena, GSS}, + *StartSymID); if (PrintForest) llvm::outs() << Root.dumpRecursive(*G, /*Abbreviated=*/true); Index: clang-tools-extra/pseudo/test/glr-variant-start.cpp =================================================================== --- /dev/null +++ clang-tools-extra/pseudo/test/glr-variant-start.cpp @@ -0,0 +1,9 @@ +// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s + +a + a; +// CHECK: statement-seq~expression-statement := expression ; +// CHECK-NEXT: ââexpression~additive-expression := additive-expression + multiplicative-expression +// CHECK-NEXT: â ââadditive-expression~IDENTIFIER := +// CHECK-NEXT: â ââ+ := +// CHECK-NEXT: â ââmultiplicative-expression~IDENTIFIER := +// CHECK-NEXT: ââ; := Index: clang-tools-extra/pseudo/lib/cxx.bnf =================================================================== --- clang-tools-extra/pseudo/lib/cxx.bnf +++ clang-tools-extra/pseudo/lib/cxx.bnf @@ -23,10 +23,15 @@ # - optional symbols are supported, with a _opt suffix; # # [1] https://isocpp.org/files/papers/N4860.pdf + +# _ lists all the start-symbols which we support parsing. # -# -# _ serves as a "fake" start symbol, coming with real grammar symbols. +# We list important nonterminals as start symbols, rather than doing it for all +# nonterminals by default, this reduces the number of states by 30% and LRTable +# actions by 16%. _ := translation-unit +_ := statement-seq +_ := declaration-seq # gram.key typedef-name := IDENTIFIER Index: clang-tools-extra/pseudo/lib/LRTableBuild.cpp =================================================================== --- clang-tools-extra/pseudo/lib/LRTableBuild.cpp +++ clang-tools-extra/pseudo/lib/LRTableBuild.cpp @@ -40,6 +40,9 @@ class LRTable::Builder { public: + Builder(llvm::ArrayRef<std::pair<SymbolID, StateID>> StartStates) + : StartStates(StartStates) {} + bool insert(Entry E) { return Entries.insert(std::move(E)).second; } LRTable build(const GrammarTable >) && { // E.g. given the following parsing table with 3 states and 3 terminals: @@ -92,24 +95,26 @@ tokenSymbol(static_cast<tok::TokenKind>(Terminal))) ++SortedIndex; } + Table.StartStates = std::move(StartStates); return Table; } private: llvm::DenseSet<Entry> Entries; + std::vector<std::pair<SymbolID, StateID>> StartStates; }; LRTable LRTable::buildForTests(const GrammarTable >, llvm::ArrayRef<Entry> Entries) { - Builder Build; + Builder Build({}); for (const Entry &E : Entries) Build.insert(E); return std::move(Build).build(GT); } LRTable LRTable::buildSLR(const Grammar &G) { - Builder Build; auto Graph = LRGraph::buildLR0(G); + Builder Build(Graph.startStates()); for (const auto &T : Graph.edges()) { Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst); Build.insert({T.Src, T.Label, Act}); @@ -120,7 +125,7 @@ for (StateID SID = 0; SID < Graph.states().size(); ++SID) { for (const Item &I : Graph.states()[SID].Items) { // If we've just parsed the start symbol, we can accept the input. - if (G.lookupRule(I.rule()).Target == G.startSymbol() && !I.hasNext()) { + if (G.lookupRule(I.rule()).Target == G.underscore() && !I.hasNext()) { Build.insert({SID, tokenSymbol(tok::eof), Action::accept(I.rule())}); continue; } Index: clang-tools-extra/pseudo/lib/LRTable.cpp =================================================================== --- clang-tools-extra/pseudo/lib/LRTable.cpp +++ clang-tools-extra/pseudo/lib/LRTable.cpp @@ -120,5 +120,16 @@ /*length=*/End - Start); } +LRTable::StateID LRTable::getStartState(SymbolID Target) const { + assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!"); + auto It = llvm::partition_point( + StartStates, [Target](const std::pair<SymbolID, StateID> &X) { + return X.first < Target; + }); + assert(It != StartStates.end() && It->first == Target && + "target symbol doesn't have a start state!"); + return It->second; +} + } // namespace pseudo } // namespace clang Index: clang-tools-extra/pseudo/lib/LRGraph.cpp =================================================================== --- clang-tools-extra/pseudo/lib/LRGraph.cpp +++ clang-tools-extra/pseudo/lib/LRGraph.cpp @@ -187,10 +187,17 @@ return States[ID]; } + void addStartState(SymbolID Sym, StateID State) { + StartStates.push_back({Sym, State}); + } + LRGraph build() && { States.shrink_to_fit(); Edges.shrink_to_fit(); - return LRGraph(std::move(States), std::move(Edges)); + llvm::sort(StartStates); + StartStates.shrink_to_fit(); + return LRGraph(std::move(States), std::move(Edges), + std::move(StartStates)); } private: @@ -199,16 +206,22 @@ std::vector<State> States; std::vector<Edge> Edges; const Grammar &G; + std::vector<std::pair<SymbolID, StateID>> StartStates; } Builder(G); std::vector<StateID> PendingStates; // Initialize states with the start symbol. - auto RRange = G.table().Nonterminals[G.startSymbol()].RuleRange; + auto RRange = G.table().Nonterminals[G.underscore()].RuleRange; for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) { auto StartState = std::vector<Item>{Item::start(RID, G)}; auto Result = Builder.insert(std::move(StartState)); assert(Result.second && "State must be new"); PendingStates.push_back(Result.first); + + const Rule &StartRule = G.lookupRule(RID); + assert(StartRule.Size == 1 && + "Start rule must have exactly one symbol in its body!"); + Builder.addStartState(StartRule.seq().front(), Result.first); } while (!PendingStates.empty()) { Index: clang-tools-extra/pseudo/lib/Grammar.cpp =================================================================== --- clang-tools-extra/pseudo/lib/Grammar.cpp +++ clang-tools-extra/pseudo/lib/Grammar.cpp @@ -24,13 +24,7 @@ } Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) { - // start symbol is named _, binary search it. - auto It = llvm::partition_point( - T->Nonterminals, - [](const GrammarTable::Nonterminal &X) { return X.Name < "_"; }); - assert(It != T->Nonterminals.end() && It->Name == "_" && - "symbol _ must exist in the grammar!"); - StartSymbol = It - T->Nonterminals.begin(); + Underscore = *findNonterminal("_"); } llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const { @@ -51,6 +45,15 @@ return T->Nonterminals[SID].Name; } +llvm::Optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const { + auto It = llvm::partition_point( + T->Nonterminals, + [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; }); + if (It != T->Nonterminals.end() && It->Name == Name) + return It - T->Nonterminals.begin(); + return llvm::None; +} + std::string Grammar::dumpRule(RuleID RID) const { std::string Result; llvm::raw_string_ostream OS(Result); @@ -132,8 +135,9 @@ // is completed at a fixed point where there is no more new symbols can be // added to any of the follow sets. // - // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol. - FollowSets[G.startSymbol()].insert(tokenSymbol(tok::eof)); + // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the + // augmented grammar, in our case it is '_'. + FollowSets[G.underscore()].insert(tokenSymbol(tok::eof)); bool Changed = true; while (Changed) { Changed = false; Index: clang-tools-extra/pseudo/lib/GLR.cpp =================================================================== --- clang-tools-extra/pseudo/lib/GLR.cpp +++ clang-tools-extra/pseudo/lib/GLR.cpp @@ -36,8 +36,8 @@ return OS; } -const ForestNode &glrParse(const TokenStream &Tokens, - const ParseParams &Params) { +const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params, + SymbolID StartSymbol) { llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Tokens); auto &G = Params.G; auto &GSS = Params.GSStack; @@ -61,9 +61,9 @@ } } }; - std::vector<const GSS::Node *> NewHeads = { - GSS.addNode(/*State=*/0, /*ForestNode*/ nullptr, {})}; + GSS.addNode(/*State=*/Params.Table.getStartState(StartSymbol), + /*ForestNode=*/nullptr, {})}; for (const ForestNode &Terminal : Terminals) { LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n", G.symbolName(Terminal.symbol()), @@ -101,11 +101,7 @@ return *PendingAccept.front().Head->Payload; } // We failed to parse the input, returning an opaque forest node for recovery. - auto RulesForStart = G.rulesFor(G.startSymbol()); - // FIXME: support multiple start symbols. - assert(RulesForStart.size() == 1 && RulesForStart.front().Size == 1 && - "start symbol _ must have exactly one rule"); - return Params.Forest.createOpaque(RulesForStart.front().Sequence[0], 0); + return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); } // Apply all pending shift actions. Index: clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h =================================================================== --- clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h +++ clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h @@ -132,6 +132,17 @@ // Returns empty if no available actions in the table. llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const; + // Returns the state from which the LR parser should start to parse the input + // tokens as the given StartSymbol. + // + // In LR parsing, the start state of `translation-unit` corresponds to + // `_ := ⢠translation-unit`. + // + // Each start state responds to **a** single grammar rule like `_ := start`. + // REQUIRE: The given StartSymbol must exist in the grammar (in a form of + // `_ := start`). + StateID getStartState(SymbolID StartSymbol) const; + size_t bytes() const { return sizeof(*this) + Actions.capacity() * sizeof(Action) + States.capacity() * sizeof(StateID) + @@ -171,6 +182,8 @@ std::vector<StateID> States; // A flat list of available actions, sorted by (SymbolID, State). std::vector<Action> Actions; + // A sorted table, storing the start state for each target parsing symbol. + std::vector<std::pair<SymbolID, StateID>> StartStates; }; llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &); Index: clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h =================================================================== --- clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h +++ clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h @@ -139,15 +139,21 @@ llvm::ArrayRef<State> states() const { return States; } llvm::ArrayRef<Edge> edges() const { return Edges; } + llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const { + return StartStates; + } std::string dumpForTests(const Grammar &) const; private: - LRGraph(std::vector<State> States, std::vector<Edge> Edges) - : States(std::move(States)), Edges(std::move(Edges)) {} + LRGraph(std::vector<State> States, std::vector<Edge> Edges, + std::vector<std::pair<SymbolID, StateID>> StartStates) + : States(std::move(States)), Edges(std::move(Edges)), + StartStates(std::move(StartStates)) {} std::vector<State> States; std::vector<Edge> Edges; + std::vector<std::pair<SymbolID, StateID>> StartStates; }; } // namespace pseudo Index: clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h =================================================================== --- clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h +++ clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h @@ -39,6 +39,7 @@ #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/StringRef.h" #include <cstdint> #include <vector> @@ -117,8 +118,8 @@ static std::unique_ptr<Grammar> parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diags); - // Returns the SymbolID of the start symbol '_'. - SymbolID startSymbol() const { return StartSymbol; }; + // Returns the SymbolID of the symbol '_'. + SymbolID underscore() const { return Underscore; }; // Returns all rules of the given nonterminal symbol. llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const; @@ -128,6 +129,9 @@ // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator). llvm::StringRef symbolName(SymbolID) const; + // Lookup the SymbolID of the nonterminal symbol by Name. + llvm::Optional<SymbolID> findNonterminal(llvm::StringRef Name) const; + // Dumps the whole grammar. std::string dump() const; // Dumps a particular rule. @@ -139,8 +143,9 @@ private: std::unique_ptr<GrammarTable> T; - // The start symbol '_' of the augmented grammar. - SymbolID StartSymbol; + // The symbol ID of '_'. (In the LR literature, this is the start symbol of + // the augmented grammar.) + SymbolID Underscore; }; // For each nonterminal X, computes the set of terminals that begin strings // derived from X. (Known as FIRST sets in grammar-based parsers). Index: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h =================================================================== --- clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -122,13 +122,14 @@ ForestArena &Forest; // Storage for the output forest. GSS &GSStack; // Storage for parsing stacks. }; -// Parse the given token stream with the GLR algorithm, and return a forest node -// of the start symbol. +// Parses the given token stream as the start symbol with the GLR algorithm, +// and returns a forest node of the start symbol. // -// If the parsing fails, we model it as an opaque node in the forest. +// A rule `_ := StartSymbol` must exit for the chosen start symbol. // -// FIXME: add support for variant start symbols. -const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params); +// If the parsing fails, we model it as an opaque node in the forest. +const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params, + SymbolID StartSymbol); // An active stack head can have multiple available actions (reduce/reduce // actions, reduce/shift actions). Index: clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp =================================================================== --- clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp +++ clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp @@ -58,8 +58,9 @@ clang::pseudo::ForestArena Arena; clang::pseudo::GSS GSS; - auto &Root = glrParse(ParseableStream, - clang::pseudo::ParseParams{*G, T, Arena, GSS}); + auto &Root = + glrParse(ParseableStream, clang::pseudo::ParseParams{*G, T, Arena, GSS}, + *G->findNonterminal("translation-unit")); if (Print) llvm::outs() << Root.dumpRecursive(*G); } Index: clang-tools-extra/pseudo/benchmarks/Benchmark.cpp =================================================================== --- clang-tools-extra/pseudo/benchmarks/Benchmark.cpp +++ clang-tools-extra/pseudo/benchmarks/Benchmark.cpp @@ -103,10 +103,11 @@ clang::LangOptions LangOpts = genericLangOpts(); LRTable Table = clang::pseudo::LRTable::buildSLR(*G); TokenStream ParseableStream = parseableTokenStream(); + SymbolID StartSymbol = *G->findNonterminal("translation-unit"); for (auto _ : State) { pseudo::ForestArena Forest; pseudo::GSS GSS; - glrParse(ParseableStream, ParseParams{*G, Table, Forest, GSS}); + glrParse(ParseableStream, ParseParams{*G, Table, Forest, GSS}, StartSymbol); } State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) * SourceText->size()); @@ -116,10 +117,12 @@ static void runParseOverall(benchmark::State &State) { clang::LangOptions LangOpts = genericLangOpts(); LRTable Table = clang::pseudo::LRTable::buildSLR(*G); + SymbolID StartSymbol = *G->findNonterminal("translation-unit"); for (auto _ : State) { pseudo::ForestArena Forest; pseudo::GSS GSS; - glrParse(parseableTokenStream(), ParseParams{*G, Table, Forest, GSS}); + glrParse(parseableTokenStream(), ParseParams{*G, Table, Forest, GSS}, + StartSymbol); } State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) * SourceText->size());
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits