hokein created this revision. hokein added a reviewer: sammccall. Herald added subscribers: mgrang, mgorny. hokein requested review of this revision. Herald added a project: clang.
LRGraph is the key component of the clang pseudo parser, it is a deterministic handle-finding finite-state machine, which is used to generated the LR parsing table. Separate from https://reviews.llvm.org/D118196. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D119172 Files: clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
Index: clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp =================================================================== --- /dev/null +++ clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp @@ -0,0 +1,84 @@ +//===--- LRGraphTest.cpp - LRGraph tests -------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/LRGraph.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include <memory> + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +TEST(LRGraph, Build) { + struct TestCase { + llvm::StringRef BNF; + llvm::StringRef ExpectedStates; + }; + + TestCase Cases[] = {{ + R"bnf( +_ := expr +expr := IDENTIFIER + )bnf", + R"(States: +State 0 + _ := ⢠expr + expr := ⢠IDENTIFIER +State 1 + _ := expr ⢠+State 2 + expr := IDENTIFIER ⢠+0->[expr]1 +0->[IDENTIFIER]2 +)"}, + {// A grammar with a S/R conflict in SLR table: + // (id-id)-id, or id-(id-id). + R"bnf( +_ := expr +expr := expr - expr # S/R conflict at state 4 on '-' token +expr := IDENTIFIER + )bnf", + R"(States: +State 0 + _ := ⢠expr + expr := ⢠expr - expr + expr := ⢠IDENTIFIER +State 1 + _ := expr ⢠+ expr := expr ⢠- expr +State 2 + expr := IDENTIFIER ⢠+State 3 + expr := ⢠expr - expr + expr := expr - ⢠expr + expr := ⢠IDENTIFIER +State 4 + expr := expr - expr ⢠+ expr := expr ⢠- expr +0->[expr]1 +0->[IDENTIFIER]2 +1->[-]3 +3->[expr]4 +3->[IDENTIFIER]2 +4->[-]3 +)"}}; + for (const auto &C : Cases) { + std::vector<std::string> Diags; + auto G = Grammar::parseBNF(C.BNF, Diags); + ASSERT_THAT(Diags, testing::IsEmpty()); + auto LR0 = LRGraph::buildLR0(*G); + EXPECT_EQ(LR0.dumpForTests(*G), C.ExpectedStates); + } +} + +} // namespace +} // namespace pseudo +} // namespace syntax +} // namespace clang Index: clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt =================================================================== --- clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt +++ clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -4,6 +4,7 @@ add_clang_unittest(ClangPseudoTests GrammarTest.cpp + LRGraphTest.cpp ) clang_target_link_libraries(ClangPseudoTests Index: clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp =================================================================== --- /dev/null +++ clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp @@ -0,0 +1,202 @@ +//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/LRGraph.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include <tuple> + +namespace clang { +namespace syntax { +namespace pseudo { + +namespace { + +// Computes a closure of the given item set S: +// - extends the given S to contain all options for parsing next token; +// - nonterminals after a dot are recursively expanded into the begin-state +// of all production rules that produce that nonterminal; +// +// Given +// Grammar rules = [ _ := E, E := E-T, E := T, T := n, T := (E) ] +// Input = [ E := .T ] +// returns [ E := .T, T := .n, T := .(E) ] +State closure(ItemSet IS, const Grammar &G) { + llvm::DenseSet<Item> Closure = {IS.begin(), IS.end()}; + while (!IS.empty()) { + auto ProcessingItem = std::move(IS.back()); + IS.pop_back(); + + if (!ProcessingItem.hasNext()) + continue; + SymbolID NextSym = ProcessingItem.next(G); + + if (pseudo::isToken(NextSym)) + continue; + auto RRange = G.table().Nonterminals[NextSym].RuleRange; + for (RuleID RID = RRange.start; RID < RRange.end; ++RID) { + Item NewItem(RID, /*dot*/ 0, G); + if (Closure.insert(NewItem).second) // new + IS.push_back(std::move(NewItem)); + } + } + ItemSet Sorted = {Closure.begin(), Closure.end()}; + llvm::sort(Sorted, State::LessItem(G)); + return {std::move(Sorted)}; +} + +// Returns all next (with a dot advanced) kernel item sets, partitioned by the +// advanced symbol. +// +// Given +// S = [ E := .ab, E := E.-T ] +// returns [ +// {id(a), [ E := a.b ]}, +// {id(-), [ E := E-.T]} +// ] +std::vector<std::pair<SymbolID, ItemSet>> +nextAvailableKernelItems(const State &S, const Grammar &G) { + std::vector<std::pair<SymbolID, ItemSet>> Results; + llvm::ArrayRef<Item> AllItems = S.Items; + AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); }); + while (!AllItems.empty()) { + SymbolID AdvancedSymbol = AllItems.front().next(G); + auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) { + assert(I.hasNext()); + return I.next(G) == AdvancedSymbol; + }); + assert(!Batch.empty()); + AllItems = AllItems.drop_front(Batch.size()); + + // Advance a dot over the Symbol. + ItemSet Next; + for (const Item &I : Batch) + Next.push_back(Item(I.rule(), I.dot() + 1, G)); + // sort the set to keep order determinism for hash computation. + llvm::sort(Next); + Results.push_back({AdvancedSymbol, std::move(Next)}); + } + return Results; +} + +} // namespace + +std::string Item::dump(const Grammar &G) const { + const auto &Rule = G.lookupRule(RID); + auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) { + std::vector<llvm::StringRef> Results; + for (auto SID : Syms) + Results.push_back(G.symbolName(SID)); + return Results; + }; + return llvm::formatv("{0} := {1} ⢠{2}\n", G.symbolName(Rule.Target), + llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "), + llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " ")) + .str(); +} + +std::string State::dump(const Grammar &G) const { + std::string Result; + llvm::raw_string_ostream OS(Result); + for (const auto &Item : Items) + OS << llvm::formatv(" {0}", Item.dump(G)); + return OS.str(); +} + +std::string LRGraph::dumpForTests(const Grammar &G) const { + std::string Result; + llvm::raw_string_ostream OS(Result); + + OS << "States:\n"; + for (StateID ID = 0; ID < States.size(); ++ID) { + OS << llvm::formatv("State {0}\n", ID); + OS << States[ID].dump(G); + } + for (const auto &E : Edges) { + OS << llvm::formatv("{0}->[{1}]{2}\n", E.Src, G.symbolName(E.Label), E.Dst); + } + return OS.str(); +} + +LRGraph LRGraph::buildLR0(const Grammar &G) { + class Builder { + public: + Builder(const Grammar &G) : G(G) {} + + // Adds a given state if not existed. + std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) { + assert(llvm::is_sorted(KernelItems) && + "Item must be sorted before calculating its hash value!"); + llvm::hash_code Hash = + llvm::hash_combine_range(KernelItems.begin(), KernelItems.end()); + auto It = StatesIndex.find(Hash); + if (It != StatesIndex.end()) + return {It->second, false}; + States.push_back(closure(std::move(KernelItems), G)); + StateID NextStateID = States.size() - 1; + StatesIndex.insert({Hash, NextStateID}); + return {NextStateID, true}; + } + + void insertEdge(StateID Src, StateID Dst, SymbolID Label) { + Edges.push_back({Src, Dst, Label}); + } + + // Returns a state with the given id. + const State &find(StateID ID) const { + assert(ID < States.size()); + return States[ID]; + } + + LRGraph build() && { + States.resize(States.size()); + Edges.resize(Edges.size()); + return LRGraph(std::move(States), std::move(Edges)); + } + + private: + // We store the hash codes, as collisions is rare and ItemSet is heavy. + // key is the hash value of **kernel** item sets. + llvm::DenseMap<llvm::hash_code, /*index of States*/ size_t> StatesIndex; + std::vector<State> States; + std::vector<Edge> Edges; + const Grammar &G; + } Builder(G); + + std::vector<StateID> PendingStates; + // Initialize states with the start symbol. + auto RRange = G.table().Nonterminals[G.startSymbol()].RuleRange; + for (RuleID RID = RRange.start; RID < RRange.end; ++RID) { + auto StartState = std::vector<Item>{Item(RID, 0, G)}; + auto Result = Builder.insert(std::move(StartState)); + assert(Result.second && "State must be new"); + PendingStates.push_back(Result.first); + } + + while (!PendingStates.empty()) { + auto CurrentStateID = PendingStates.back(); + PendingStates.pop_back(); + for (auto Next : + nextAvailableKernelItems(Builder.find(CurrentStateID), G)) { + auto Insert = Builder.insert(Next.second); + if (Insert.second) // new state, insert to the pending queue. + PendingStates.push_back(Insert.first); + Builder.insertEdge(CurrentStateID, Insert.first, Next.first); + } + } + return std::move(Builder).build(); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang Index: clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt =================================================================== --- clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt +++ clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -3,6 +3,7 @@ add_clang_library(clangToolingSyntaxPseudo Grammar.cpp GrammarBNF.cpp + LRGraph.cpp LINK_LIBS clangBasic Index: clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h =================================================================== --- /dev/null +++ clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h @@ -0,0 +1,180 @@ +//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// LR parsers are bottom-up parsers -- they scan the input from left to right, +// and find the right-hand side of a production rule (called handle), and +// replace the handle with the nonterminal defined by the production rule. +// +// This file defines LRGraph, a deterministic handle-finding finite-state +// automaton, which is a key component in LR parsers to recognize any of +// handles in the grammar efficiently. We build the LR table (ACTION and GOTO +// Table) based on the LRGraph. +// +// LRGraph can be constructed for any context-free grammars. +// Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but +// interpretation of the FSA is nondeterminsitic -- we might in a state where +// we can continue searching an handle and identify a handle (called +// shift/reduce conflicts), or identify more than one handle (callled +// reduce/reduce conflicts). +// +// LRGraph is a common model for all variants of LR automatons, from the most +// basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead +// in making decisions. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H +#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/Hashing.h" +#include <vector> + +namespace clang { +namespace syntax { +namespace pseudo { + +// An LR item -- a grammar rule with a dot at some position of the body. +// e.g. a production rule A := xy yields 3 items: +// A := .xy +// A := x.y +// A := xy. +// An item indicates how much of a production rule has been recognized at a +// position (described by dot), for example, A := x.y indicates that we have +// recognized the x part from the input, and we hope next to see the input +// derivable from y. +class Item { +public: + Item(RuleID ID, uint8_t DotPos, uint8_t RuleLength) + : RID(ID), DotPos(DotPos), RuleLength(RuleLength) {} + Item(RuleID ID, uint8_t DotPos, const Grammar &G) + : RID(ID), DotPos(DotPos), RuleLength(G.lookupRule(ID).Size) { + assert(DotPos <= RuleLength); + } + + RuleID rule() const { return RID; } + uint8_t dot() const { return DotPos; } + + bool hasNext() const { return DotPos < RuleLength; } + SymbolID next(const Grammar &G) const { + assert(hasNext()); + return G.lookupRule(RID).Sequence[DotPos]; + } + + std::string dump(const Grammar &G) const; + + bool operator==(const Item &I) const { + return DotPos == I.DotPos && RID == I.RID; + } + bool operator<(const Item &I) const { + return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos); + } + friend llvm::hash_code hash_value(const Item &I) { + return llvm::hash_combine(I.RID, I.DotPos); + } + +private: + RuleID RID = 0; + uint8_t DotPos = 0; + uint8_t RuleLength = 0; // the length of rule body. +}; +using ItemSet = std::vector<Item>; + +// A state represents a node in the LR automaton graph. It is an item set, which +// contains all possible rules that the LR parser may be parsing in that state. +// +// Conceptually, If we knew in advance what we're parsing, at any point we're +// partway through parsing a production, sitting on a stack of partially parsed +// productions. But because we don't know, there could be *several* productions +// we're partway through. The set of possibilities is the parser state, and we +// precompute all the transitions between these states. +struct State { + // A full set of items (including non-kernel items) representing the state, + // sorted by LessItem. + ItemSet Items; + + std::string dump(const Grammar &G) const; + struct LessItem { + LessItem(const Grammar &G) : G(G) {} + bool operator()(const Item &L, const Item &R) { + if (L.hasNext() && R.hasNext()) { + if (L.next(G) == R.next(G)) + return L < R; + return L.next(G) < R.next(G); + } + // Item with a trailing dot is considered minimal. + return L.hasNext() < R.hasNext(); + } + const Grammar &G; + }; +}; + +// LRGraph is a deterministic finite state automaton for LR parsing. +// +// Intuitively, an LR automaton is a transition graph. The graph has a +// collection of nodes, called States. Each state corresponds to a particular +// item set, which represents a condition that could occur duing the process of +// parsing a production. Edges are directed from one state to another. Each edge +// is labeled by a grammar symbol (terminal or nonterminal). +// +// LRGraph is used to construct the LR parsing table which is a core +// data-structure driving the LR parser. +class LRGraph { +public: + // StateID is the index in States table. + using StateID = uint16_t; + + // Constructs an LR(0) automaton. + static LRGraph buildLR0(const Grammar &); + + // An edge in the LR graph, it represents a transition in the LR automaton. + // If the parser is at state Src, with a lookahead Label, then it + // transits to state Dst. + struct Edge { + StateID Src, Dst; + SymbolID Label; + }; + + llvm::ArrayRef<State> states() const { return States; } + llvm::ArrayRef<Edge> edges() const { return Edges; } + + std::string dumpForTests(const Grammar &) const; + +private: + LRGraph(std::vector<State> States, std::vector<Edge> Edges) + : States(std::move(States)), Edges(std::move(Edges)) {} + + std::vector<State> States; + std::vector<Edge> Edges; +}; + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +namespace llvm { +// Support clang::syntax::pseudo::Item as DenseMap keys. +template <> struct DenseMapInfo<clang::syntax::pseudo::Item> { + static inline clang::syntax::pseudo::Item getEmptyKey() { + static clang::syntax::pseudo::Item EmptyKey(-1, 0, 0); + return EmptyKey; + } + static inline clang::syntax::pseudo::Item getTombstoneKey() { + static clang::syntax::pseudo::Item TombstoneKey(-2, 0, 0); + return TombstoneKey; + } + static unsigned getHashValue(const clang::syntax::pseudo::Item &I) { + return hash_value(I); + } + static bool isEqual(const clang::syntax::pseudo::Item &LHS, + const clang::syntax::pseudo::Item &RHS) { + return LHS == RHS; + } +}; +} // namespace llvm + +#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits