hokein created this revision. hokein added a reviewer: sammccall. Herald added subscribers: mgrang, mgorny. Herald added a project: All. hokein requested review of this revision. Herald added a subscriber: alextsao1999. Herald added a project: clang-tools-extra.
Reductions need to be performed in a careful order in GLR parser, to make sure we gather all alternatives before creating an ambigous forest node. This patch encodes the nonterminal order into the rule id, so that we can efficiently to determinal ordering of reductions in GLR parser. This patch also abstracts to a TestGrammar, which is shared among tests. This is a part of the GLR parser, https://reviews.llvm.org/D121368, https://reviews.llvm.org/D121150 Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D122303 Files: clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h clang-tools-extra/pseudo/lib/GrammarBNF.cpp clang-tools-extra/pseudo/test/lr-build-basic.test clang-tools-extra/pseudo/test/lr-build-conflicts.test clang-tools-extra/pseudo/unittests/CMakeLists.txt clang-tools-extra/pseudo/unittests/GrammarTest.cpp clang-tools-extra/pseudo/unittests/TestGrammar.cpp clang-tools-extra/pseudo/unittests/TestGrammar.h
Index: clang-tools-extra/pseudo/unittests/TestGrammar.h =================================================================== --- /dev/null +++ clang-tools-extra/pseudo/unittests/TestGrammar.h @@ -0,0 +1,34 @@ +//===-- TestGrammar.h ------------------------------------------------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides shared grammar-related test facilities among tests. +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Grammar.h" + +namespace clang { +namespace pseudo { + +struct TestGrammar { + static TestGrammar build(llvm::StringRef BNF); + + // Returns the symbol id for the given name. + SymbolID symbol(llvm::StringRef Name) const; + + // Returns the rule id for the given nonterminal name. + // The nonterminal symbo is expected to have a single rule in the grammar. + RuleID singleRuleFor(llvm::StringRef NonterminalName) const; + + std::unique_ptr<Grammar> G; + std::vector<std::string> Diags; +}; + +} // namespace pseudo +} // namespace clang Index: clang-tools-extra/pseudo/unittests/TestGrammar.cpp =================================================================== --- /dev/null +++ clang-tools-extra/pseudo/unittests/TestGrammar.cpp @@ -0,0 +1,43 @@ +//===-- TestGrammar.cpp -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TestGrammar.h" +#include "llvm/Support/raw_ostream.h" + +namespace clang { +namespace pseudo { + +TestGrammar TestGrammar::build(llvm::StringRef BNF) { + TestGrammar T; + T.G = Grammar::parseBNF(BNF, T.Diags); + return T; +} + +SymbolID TestGrammar::symbol(llvm::StringRef Name) const { + for (unsigned I = 0; I < NumTerminals; ++I) + if (G->table().Terminals[I] == Name) + return tokenSymbol(static_cast<tok::TokenKind>(I)); + for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID) + if (G->table().Nonterminals[ID].Name == Name) + return ID; + llvm::errs() << "No such symbol found: " << Name; + std::abort(); +} + +RuleID TestGrammar::singleRuleFor(llvm::StringRef NonterminalName) const { + auto RuleRange = G->table().Nonterminals[symbol(NonterminalName)].RuleRange; + if (RuleRange.End - RuleRange.Start == 1) + return G->table().Nonterminals[symbol(NonterminalName)].RuleRange.Start; + llvm::errs() << "Expected a single rule for " << NonterminalName + << ", but it has " << RuleRange.End - RuleRange.Start + << " rule!\n"; + std::abort(); +} + +} // namespace pseudo +} // namespace clang Index: clang-tools-extra/pseudo/unittests/GrammarTest.cpp =================================================================== --- clang-tools-extra/pseudo/unittests/GrammarTest.cpp +++ clang-tools-extra/pseudo/unittests/GrammarTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/Grammar.h" +#include "TestGrammar.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include <memory> @@ -26,76 +27,78 @@ return testing::Property(&Rule::seq, ElementsAre(IDs...)); } -class GrammarTest : public ::testing::Test { -public: - void build(llvm::StringRef BNF) { - Diags.clear(); - G = Grammar::parseBNF(BNF, Diags); - } - - SymbolID id(llvm::StringRef Name) const { - for (unsigned I = 0; I < NumTerminals; ++I) - if (G->table().Terminals[I] == Name) - return tokenSymbol(static_cast<tok::TokenKind>(I)); - for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID) - if (G->table().Nonterminals[ID].Name == Name) - return ID; - ADD_FAILURE() << "No such symbol found: " << Name; - return 0; - } - -protected: - std::unique_ptr<Grammar> G; - std::vector<std::string> Diags; -}; - -TEST_F(GrammarTest, Basic) { - build("_ := IDENTIFIER + _ # comment"); - EXPECT_THAT(Diags, IsEmpty()); +TEST(GrammarTest, Basic) { + const auto &TG = TestGrammar::build("_ := IDENTIFIER + _ # comment"); + EXPECT_THAT(TG.Diags, IsEmpty()); auto ExpectedRule = - AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_"))); - EXPECT_EQ(G->symbolName(id("_")), "_"); - EXPECT_THAT(G->rulesFor(id("_")), UnorderedElementsAre(ExpectedRule)); - const auto &Rule = G->lookupRule(/*RID=*/0); + AllOf(TargetID(TG.symbol("_")), + Sequence(TG.symbol("IDENTIFIER"), TG.symbol("+"), TG.symbol("_"))); + EXPECT_EQ(TG.G->symbolName(TG.symbol("_")), "_"); + EXPECT_THAT(TG.G->rulesFor(TG.symbol("_")), + UnorderedElementsAre(ExpectedRule)); + const auto &Rule = TG.G->lookupRule(/*RID=*/0); EXPECT_THAT(Rule, ExpectedRule); - EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER"); - EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+"); - EXPECT_THAT(G->symbolName(Rule.seq()[2]), "_"); + EXPECT_THAT(TG.G->symbolName(Rule.seq()[0]), "IDENTIFIER"); + EXPECT_THAT(TG.G->symbolName(Rule.seq()[1]), "+"); + EXPECT_THAT(TG.G->symbolName(Rule.seq()[2]), "_"); } -TEST_F(GrammarTest, EliminatedOptional) { - build("_ := CONST_opt INT ;_opt"); - EXPECT_THAT(Diags, IsEmpty()); - EXPECT_THAT(G->table().Rules, - UnorderedElementsAre(Sequence(id("INT")), - Sequence(id("CONST"), id("INT")), - Sequence(id("CONST"), id("INT"), id(";")), - Sequence(id("INT"), id(";")))); +TEST(GrammarTest, RuleIDSorted) { + const auto &TG = TestGrammar::build(R"bnf( + _ := x + + x := y + y := z + z := IDENTIFIER + )bnf"); + EXPECT_THAT(TG.Diags, IsEmpty()); + + EXPECT_LT(TG.singleRuleFor("z"), TG.singleRuleFor("y")); + EXPECT_LT(TG.singleRuleFor("y"), TG.singleRuleFor("x")); + EXPECT_LT(TG.singleRuleFor("x"), TG.singleRuleFor("_")); +} + +TEST(GrammarTest, EliminatedOptional) { + const auto &TG = TestGrammar::build("_ := CONST_opt INT ;_opt"); + EXPECT_THAT(TG.Diags, IsEmpty()); + EXPECT_THAT( + TG.G->table().Rules, + UnorderedElementsAre( + Sequence(TG.symbol("INT")), + Sequence(TG.symbol("CONST"), TG.symbol("INT")), + Sequence(TG.symbol("CONST"), TG.symbol("INT"), TG.symbol(";")), + Sequence(TG.symbol("INT"), TG.symbol(";")))); } -TEST_F(GrammarTest, Diagnostics) { - build(R"cpp( +TEST(GrammarTest, Diagnostics) { + const auto &TG = TestGrammar::build(R"cpp( _ := ,_opt _ := undefined-sym null := _ := IDENFIFIE # a typo of the terminal IDENFITIER invalid + + # cycle + a := b + b := a )cpp"); - EXPECT_EQ(G->startSymbol(), id("_")); - EXPECT_THAT(Diags, UnorderedElementsAre( - "Rule '_ := ,_opt' has a nullable RHS", - "Rule 'null := ' has a nullable RHS", - "No rules for nonterminal: undefined-sym", - "Failed to parse 'invalid': no separator :=", - "Token-like name IDENFIFIE is used as a nonterminal", - "No rules for nonterminal: IDENFIFIE")); + EXPECT_EQ(TG.G->startSymbol(), TG.symbol("_")); + EXPECT_THAT( + TG.Diags, + UnorderedElementsAre("Rule '_ := ,_opt' has a nullable RHS", + "Rule 'null := ' has a nullable RHS", + "No rules for nonterminal: undefined-sym", + "Failed to parse 'invalid': no separator :=", + "Token-like name IDENFIFIE is used as a nonterminal", + "No rules for nonterminal: IDENFIFIE", + "The grammar is cyclic, see symbol a")); } -TEST_F(GrammarTest, FirstAndFollowSets) { - build( +TEST(GrammarTest, FirstAndFollowSets) { + auto TG = TestGrammar::build( R"bnf( _ := expr expr := expr - term @@ -103,7 +106,7 @@ term := IDENTIFIER term := ( expr ) )bnf"); - ASSERT_TRUE(Diags.empty()); + ASSERT_TRUE(TG.Diags.empty()); auto ToPairs = [](std::vector<llvm::DenseSet<SymbolID>> Input) { std::vector<std::pair<SymbolID, llvm::DenseSet<SymbolID>>> Sets; for (SymbolID ID = 0; ID < Input.size(); ++ID) @@ -112,19 +115,25 @@ }; EXPECT_THAT( - ToPairs(firstSets(*G)), + ToPairs(firstSets(*TG.G)), UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), - Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), - Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("("))))); - EXPECT_THAT( - ToPairs(followSets(*G)), - UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("EOF"))), - Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))), - Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))))); - - build(R"bnf( + Pair(TG.symbol("_"), + UnorderedElementsAre(TG.symbol("IDENTIFIER"), TG.symbol("("))), + Pair(TG.symbol("expr"), + UnorderedElementsAre(TG.symbol("IDENTIFIER"), TG.symbol("("))), + Pair(TG.symbol("term"), + UnorderedElementsAre(TG.symbol("IDENTIFIER"), TG.symbol("("))))); + EXPECT_THAT(ToPairs(followSets(*TG.G)), + UnorderedElementsAre( + Pair(TG.symbol("_"), UnorderedElementsAre(TG.symbol("EOF"))), + Pair(TG.symbol("expr"), + UnorderedElementsAre(TG.symbol("-"), TG.symbol("EOF"), + TG.symbol(")"))), + Pair(TG.symbol("term"), + UnorderedElementsAre(TG.symbol("-"), TG.symbol("EOF"), + TG.symbol(")"))))); + + TG = TestGrammar::build(R"bnf( # A simplfied C++ decl-specifier-seq. _ := decl-specifier-seq decl-specifier-seq := decl-specifier decl-specifier-seq @@ -133,25 +142,30 @@ decl-specifier := INLINE simple-type-specifier := INT )bnf"); - ASSERT_TRUE(Diags.empty()); + ASSERT_TRUE(TG.Diags.empty()); EXPECT_THAT( - ToPairs(firstSets(*G)), + ToPairs(firstSets(*TG.G)), UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))), - Pair(id("decl-specifier-seq"), - UnorderedElementsAre(id("INLINE"), id("INT"))), - Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))), - Pair(id("decl-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"))))); + Pair(TG.symbol("_"), + UnorderedElementsAre(TG.symbol("INLINE"), TG.symbol("INT"))), + Pair(TG.symbol("decl-specifier-seq"), + UnorderedElementsAre(TG.symbol("INLINE"), TG.symbol("INT"))), + Pair(TG.symbol("simple-type-specifier"), + UnorderedElementsAre(TG.symbol("INT"))), + Pair(TG.symbol("decl-specifier"), + UnorderedElementsAre(TG.symbol("INLINE"), TG.symbol("INT"))))); EXPECT_THAT( - ToPairs(followSets(*G)), + ToPairs(followSets(*TG.G)), UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("EOF"))), - Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))), - Pair(id("decl-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))), - Pair(id("simple-type-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))))); + Pair(TG.symbol("_"), UnorderedElementsAre(TG.symbol("EOF"))), + Pair(TG.symbol("decl-specifier-seq"), + UnorderedElementsAre(TG.symbol("EOF"))), + Pair(TG.symbol("decl-specifier"), + UnorderedElementsAre(TG.symbol("INLINE"), TG.symbol("INT"), + TG.symbol("EOF"))), + Pair(TG.symbol("simple-type-specifier"), + UnorderedElementsAre(TG.symbol("INLINE"), TG.symbol("INT"), + TG.symbol("EOF"))))); } } // namespace Index: clang-tools-extra/pseudo/unittests/CMakeLists.txt =================================================================== --- clang-tools-extra/pseudo/unittests/CMakeLists.txt +++ clang-tools-extra/pseudo/unittests/CMakeLists.txt @@ -7,6 +7,7 @@ DirectiveMapTest.cpp GrammarTest.cpp LRTableTest.cpp + TestGrammar.cpp TokenTest.cpp ) Index: clang-tools-extra/pseudo/test/lr-build-conflicts.test =================================================================== --- clang-tools-extra/pseudo/test/lr-build-conflicts.test +++ clang-tools-extra/pseudo/test/lr-build-conflicts.test @@ -5,8 +5,8 @@ # RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH # GRAPH: States # GRAPH-NEXT: State 0 -# GRAPH-NEXT: _ := ⢠expr # GRAPH-NEXT: expr := ⢠expr - expr +# GRAPH-NEXT: _ := ⢠expr # GRAPH-NEXT: expr := ⢠IDENTIFIER # GRAPH-NEXT: State 1 # GRAPH-NEXT: _ := expr ⢠@@ -42,6 +42,6 @@ # TABLE-NEXT: 'IDENTIFIER': shift state 2 # TABLE-NEXT: 'expr': go to state 4 # TABLE-NEXT: State 4 -# TABLE-NEXT: 'EOF': reduce by rule 2 'expr := expr - expr' +# TABLE-NEXT: 'EOF': reduce by rule 0 'expr := expr - expr' # TABLE-NEXT: '-': shift state 3 -# TABLE-NEXT: '-': reduce by rule 2 'expr := expr - expr' +# TABLE-NEXT: '-': reduce by rule 0 'expr := expr - expr' Index: clang-tools-extra/pseudo/test/lr-build-basic.test =================================================================== --- clang-tools-extra/pseudo/test/lr-build-basic.test +++ clang-tools-extra/pseudo/test/lr-build-basic.test @@ -26,4 +26,4 @@ # TABLE-NEXT: State 2 # TABLE-NEXT: 'EOF': reduce by rule 1 'expr := id' # TABLE-NEXT: State 3 -# TABLE-NEXT: 'EOF': reduce by rule 2 'id := IDENTIFIER' +# TABLE-NEXT: 'EOF': reduce by rule 0 'id := IDENTIFIER' Index: clang-tools-extra/pseudo/lib/GrammarBNF.cpp =================================================================== --- clang-tools-extra/pseudo/lib/GrammarBNF.cpp +++ clang-tools-extra/pseudo/lib/GrammarBNF.cpp @@ -9,6 +9,7 @@ #include "clang-pseudo/Grammar.h" #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/FormatVariadic.h" #include <memory> @@ -87,23 +88,77 @@ } assert(T->Rules.size() < (1 << RuleBits) && "Too many rules to fit in RuleID bits!"); - llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) { - // Sorted by the Target. - return std::tie(Left.Target, Left.Size) < - std::tie(Right.Target, Right.Size); - }); - RuleID RulePos = 0; + const auto &TopologicalOrder = getTopologicalOrder(T.get()); + llvm::stable_sort( + T->Rules, [&TopologicalOrder](const Rule &Left, const Rule &Right) { + // Sorted by the topological order of the nonterminal Target. + return TopologicalOrder[Left.Target] < TopologicalOrder[Right.Target]; + }); for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) { - RuleID Start = RulePos; - while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID) - ++RulePos; - T->Nonterminals[SID].RuleRange = {Start, RulePos}; + auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) { + return TopologicalOrder[R.Target] < TopologicalOrder[SID]; + }); + RuleID Start = StartIt - T->Rules.begin(); + RuleID End = Start; + while (End < T->Rules.size() && + TopologicalOrder[T->Rules[End].Target] == TopologicalOrder[SID]) + ++End; + T->Nonterminals[SID].RuleRange = {Start, End}; } auto G = std::make_unique<Grammar>(std::move(T)); diagnoseGrammar(*G); return G; } + // Gets topological order for nonterminal symbols. + // + // The topological ordering is defined as: if a *single* nonterminal A produce + // (or transitively) a nonterminal B (that said, there is a production rule + // B := A in the grammar), then A comes before B. + // + // It returns a "lookup" vector where the index is SymbolID, and the value is + // the index of the SymbolID in the topological-orderred array. + std::vector<unsigned> getTopologicalOrder(GrammarTable *T) { + llvm::DenseMap<SymbolID, llvm::DenseSet<SymbolID>> DependencyGraph; + for (const auto &Rule : T->Rules) { + // if A := B, A depends on B. + if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0])) + DependencyGraph[Rule.Target].insert(Rule.Sequence[0]); + } + std::vector<SymbolID> Order; + // Each nonterminal state flows: NotVisited -> Visiting -> Visited. + enum State { + NotVisited, + Visiting, + Visited, + }; + std::vector<SymbolID> VisitStates(T->Nonterminals.size(), NotVisited); + std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void { + if (VisitStates[SID] == Visited) + return; + if (VisitStates[SID] == Visiting) { + Diagnostics.push_back( + llvm::formatv("The grammar is cyclic, see symbol {0}", + T->Nonterminals[SID].Name)); + return; + } + VisitStates[SID] = Visiting; + auto It = DependencyGraph.find(SID); + if (It != DependencyGraph.end()) { + for (SymbolID Dep : (It->getSecond())) + DFS(Dep); + } + VisitStates[SID] = Visited; + Order.push_back(SID); + }; + for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID) + DFS(ID); + std::vector<unsigned> Resuslt(T->Nonterminals.size(), 0); + for (size_t I = 0; I < Order.size(); ++I) + Resuslt[Order[I]] = I; + return Resuslt; + } + private: // Text representation of a BNF grammar rule. struct RuleSpec { Index: clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h =================================================================== --- clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h +++ clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h @@ -165,7 +165,8 @@ } RuleRange; }; - // The rules are sorted (and thus grouped) by target symbol. + // The rules are topological sorted (and thus grouped) by the target symbol. + // // RuleID is the index of the vector. std::vector<Rule> Rules; // A table of terminals (aka tokens). It corresponds to the clang::Token.
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits