[PATCH] D119172: [pseudo] Implement LRGraph

Haojian Wu via Phabricator via cfe-commits Mon, 07 Feb 2022 11:37:03 -0800

hokein created this revision.
hokein added a reviewer: sammccall.
Herald added subscribers: mgrang, mgorny.
hokein requested review of this revision.
Herald added a project: clang.


LRGraph is the key component of the clang pseudo parser, it is a
deterministic handle-finding finite-state machine, which is used to
generated the LR parsing table.

Separate from https://reviews.llvm.org/D118196.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D119172

Files:
  clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h
  clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
  clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp
  clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
  clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp

Index: clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
@@ -0,0 +1,84 @@
+//===--- LRGraphTest.cpp - LRGraph tests -------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+TEST(LRGraph, Build) {
+  struct TestCase {
+    llvm::StringRef BNF;
+    llvm::StringRef ExpectedStates;
+  };
+
+  TestCase Cases[] = {{
+                          R"bnf(
+_ := expr
+expr := IDENTIFIER
+      )bnf",
+                          R"(States:
+State 0
+    _ :=  â¢ expr
+    expr :=  â¢ IDENTIFIER
+State 1
+    _ := expr â¢ 
+State 2
+    expr := IDENTIFIER â¢ 
+0->[expr]1
+0->[IDENTIFIER]2
+)"},
+                      {// A grammar with a S/R conflict in SLR table:
+                       // (id-id)-id, or id-(id-id).
+                       R"bnf(
+_ := expr
+expr := expr - expr  # S/R conflict at state 4 on '-' token
+expr := IDENTIFIER
+      )bnf",
+                       R"(States:
+State 0
+    _ :=  â¢ expr
+    expr :=  â¢ expr - expr
+    expr :=  â¢ IDENTIFIER
+State 1
+    _ := expr â¢ 
+    expr := expr â¢ - expr
+State 2
+    expr := IDENTIFIER â¢ 
+State 3
+    expr :=  â¢ expr - expr
+    expr := expr - â¢ expr
+    expr :=  â¢ IDENTIFIER
+State 4
+    expr := expr - expr â¢ 
+    expr := expr â¢ - expr
+0->[expr]1
+0->[IDENTIFIER]2
+1->[-]3
+3->[expr]4
+3->[IDENTIFIER]2
+4->[-]3
+)"}};
+  for (const auto &C : Cases) {
+    std::vector<std::string> Diags;
+    auto G = Grammar::parseBNF(C.BNF, Diags);
+    ASSERT_THAT(Diags, testing::IsEmpty());
+    auto LR0 = LRGraph::buildLR0(*G);
+    EXPECT_EQ(LR0.dumpForTests(*G), C.ExpectedStates);
+  }
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
===================================================================
--- clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -4,6 +4,7 @@
 
 add_clang_unittest(ClangPseudoTests
   GrammarTest.cpp
+  LRGraphTest.cpp
 )
 
 clang_target_link_libraries(ClangPseudoTests
Index: clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp
@@ -0,0 +1,202 @@
+//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <tuple>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+namespace {
+
+// Computes a closure of the given item set S:
+//  - extends the given S to contain all options for parsing next token;
+//  - nonterminals after a dot are recursively expanded into the begin-state
+//    of all production rules that produce that nonterminal;
+//
+// Given
+//   Grammar rules = [ _ := E, E := E-T, E := T, T := n, T := (E) ]
+//   Input = [ E := .T ]
+// returns [ E := .T, T := .n, T := .(E) ]
+State closure(ItemSet IS, const Grammar &G) {
+  llvm::DenseSet<Item> Closure = {IS.begin(), IS.end()};
+  while (!IS.empty()) {
+    auto ProcessingItem = std::move(IS.back());
+    IS.pop_back();
+
+    if (!ProcessingItem.hasNext())
+      continue;
+    SymbolID NextSym = ProcessingItem.next(G);
+
+    if (pseudo::isToken(NextSym))
+      continue;
+    auto RRange = G.table().Nonterminals[NextSym].RuleRange;
+    for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
+      Item NewItem(RID, /*dot*/ 0, G);
+      if (Closure.insert(NewItem).second) // new
+        IS.push_back(std::move(NewItem));
+    }
+  }
+  ItemSet Sorted = {Closure.begin(), Closure.end()};
+  llvm::sort(Sorted, State::LessItem(G));
+  return {std::move(Sorted)};
+}
+
+// Returns all next (with a dot advanced) kernel item sets, partitioned by the
+// advanced symbol.
+//
+// Given
+//  S = [ E := .ab, E := E.-T ]
+// returns [
+//   {id(a), [ E := a.b ]},
+//   {id(-), [ E := E-.T]}
+// ]
+std::vector<std::pair<SymbolID, ItemSet>>
+nextAvailableKernelItems(const State &S, const Grammar &G) {
+  std::vector<std::pair<SymbolID, ItemSet>> Results;
+  llvm::ArrayRef<Item> AllItems = S.Items;
+  AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); });
+  while (!AllItems.empty()) {
+    SymbolID AdvancedSymbol = AllItems.front().next(G);
+    auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) {
+      assert(I.hasNext());
+      return I.next(G) == AdvancedSymbol;
+    });
+    assert(!Batch.empty());
+    AllItems = AllItems.drop_front(Batch.size());
+
+    // Advance a dot over the Symbol.
+    ItemSet Next;
+    for (const Item &I : Batch)
+      Next.push_back(Item(I.rule(), I.dot() + 1, G));
+    // sort the set to keep order determinism for hash computation.
+    llvm::sort(Next);
+    Results.push_back({AdvancedSymbol, std::move(Next)});
+  }
+  return Results;
+}
+
+} // namespace
+
+std::string Item::dump(const Grammar &G) const {
+  const auto &Rule = G.lookupRule(RID);
+  auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) {
+    std::vector<llvm::StringRef> Results;
+    for (auto SID : Syms)
+      Results.push_back(G.symbolName(SID));
+    return Results;
+  };
+  return llvm::formatv("{0} := {1} â¢ {2}\n", G.symbolName(Rule.Target),
+                       llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
+                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
+      .str();
+}
+
+std::string State::dump(const Grammar &G) const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  for (const auto &Item : Items)
+    OS << llvm::formatv("    {0}", Item.dump(G));
+  return OS.str();
+}
+
+std::string LRGraph::dumpForTests(const Grammar &G) const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+
+  OS << "States:\n";
+  for (StateID ID = 0; ID < States.size(); ++ID) {
+    OS << llvm::formatv("State {0}\n", ID);
+    OS << States[ID].dump(G);
+  }
+  for (const auto &E : Edges) {
+    OS << llvm::formatv("{0}->[{1}]{2}\n", E.Src, G.symbolName(E.Label), E.Dst);
+  }
+  return OS.str();
+}
+
+LRGraph LRGraph::buildLR0(const Grammar &G) {
+  class Builder {
+  public:
+    Builder(const Grammar &G) : G(G) {}
+
+    // Adds a given state if not existed.
+    std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
+      assert(llvm::is_sorted(KernelItems) &&
+             "Item must be sorted before calculating its hash value!");
+      llvm::hash_code Hash =
+          llvm::hash_combine_range(KernelItems.begin(), KernelItems.end());
+      auto It = StatesIndex.find(Hash);
+      if (It != StatesIndex.end())
+        return {It->second, false};
+      States.push_back(closure(std::move(KernelItems), G));
+      StateID NextStateID = States.size() - 1;
+      StatesIndex.insert({Hash, NextStateID});
+      return {NextStateID, true};
+    }
+
+    void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
+      Edges.push_back({Src, Dst, Label});
+    }
+
+    // Returns a state with the given id.
+    const State &find(StateID ID) const {
+      assert(ID < States.size());
+      return States[ID];
+    }
+
+    LRGraph build() && {
+      States.resize(States.size());
+      Edges.resize(Edges.size());
+      return LRGraph(std::move(States), std::move(Edges));
+    }
+
+  private:
+    // We store the hash codes, as collisions is rare and ItemSet is heavy.
+    // key is the hash value of **kernel** item sets.
+    llvm::DenseMap<llvm::hash_code, /*index of States*/ size_t> StatesIndex;
+    std::vector<State> States;
+    std::vector<Edge> Edges;
+    const Grammar &G;
+  } Builder(G);
+
+  std::vector<StateID> PendingStates;
+  // Initialize states with the start symbol.
+  auto RRange = G.table().Nonterminals[G.startSymbol()].RuleRange;
+  for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
+    auto StartState = std::vector<Item>{Item(RID, 0, G)};
+    auto Result = Builder.insert(std::move(StartState));
+    assert(Result.second && "State must be new");
+    PendingStates.push_back(Result.first);
+  }
+
+  while (!PendingStates.empty()) {
+    auto CurrentStateID = PendingStates.back();
+    PendingStates.pop_back();
+    for (auto Next :
+         nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
+      auto Insert = Builder.insert(Next.second);
+      if (Insert.second) // new state, insert to the pending queue.
+        PendingStates.push_back(Insert.first);
+      Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
+    }
+  }
+  return std::move(Builder).build();
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -3,6 +3,7 @@
 add_clang_library(clangToolingSyntaxPseudo
   Grammar.cpp
   GrammarBNF.cpp
+  LRGraph.cpp
   
   LINK_LIBS
   clangBasic
Index: clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h
@@ -0,0 +1,180 @@
+//===--- LRGraph.h - Build an LR automaton  ------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  LR parsers are bottom-up parsers -- they scan the input from left to right,
+//  and find the right-hand side of a production rule (called handle), and
+//  replace the handle with the nonterminal defined by the production rule.
+//
+//  This file defines LRGraph, a deterministic handle-finding finite-state
+//  automaton, which is a key component in LR parsers to recognize any of
+//  handles in the grammar efficiently. We build the LR table (ACTION and GOTO
+//  Table) based on the LRGraph.
+//
+//  LRGraph can be constructed for any context-free grammars.
+//  Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but
+//  interpretation of the FSA is nondeterminsitic -- we might in a state where
+//  we can continue searching an handle and identify a handle (called
+//  shift/reduce conflicts), or identify more than one handle (callled
+//  reduce/reduce conflicts).
+//
+//  LRGraph is a common model for all variants of LR automatons, from the most
+//  basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
+//  in making decisions.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/Hashing.h"
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// An LR item -- a grammar rule with a dot at some position of the body.
+// e.g. a production rule A := xy yields 3 items:
+//   A := .xy
+//   A := x.y
+//   A := xy.
+// An item indicates how much of a production rule has been recognized at a
+// position (described by dot), for example, A := x.y indicates that we have
+// recognized the x part from the input, and we hope next to see the input
+// derivable from y.
+class Item {
+public:
+  Item(RuleID ID, uint8_t DotPos, uint8_t RuleLength)
+      : RID(ID), DotPos(DotPos), RuleLength(RuleLength) {}
+  Item(RuleID ID, uint8_t DotPos, const Grammar &G)
+      : RID(ID), DotPos(DotPos), RuleLength(G.lookupRule(ID).Size) {
+    assert(DotPos <= RuleLength);
+  }
+
+  RuleID rule() const { return RID; }
+  uint8_t dot() const { return DotPos; }
+
+  bool hasNext() const { return DotPos < RuleLength; }
+  SymbolID next(const Grammar &G) const {
+    assert(hasNext());
+    return G.lookupRule(RID).Sequence[DotPos];
+  }
+
+  std::string dump(const Grammar &G) const;
+
+  bool operator==(const Item &I) const {
+    return DotPos == I.DotPos && RID == I.RID;
+  }
+  bool operator<(const Item &I) const {
+    return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
+  }
+  friend llvm::hash_code hash_value(const Item &I) {
+    return llvm::hash_combine(I.RID, I.DotPos);
+  }
+
+private:
+  RuleID RID = 0;
+  uint8_t DotPos = 0;
+  uint8_t RuleLength = 0; // the length of rule body.
+};
+using ItemSet = std::vector<Item>;
+
+// A state represents a node in the LR automaton graph. It is an item set, which
+// contains all possible rules that the LR parser may be parsing in that state.
+//
+// Conceptually, If we knew in advance what we're parsing, at any point we're
+// partway through parsing a production, sitting on a stack of partially parsed
+// productions. But because we don't know, there could be *several* productions
+// we're partway through. The set of possibilities is the parser state, and we
+// precompute all the transitions between these states.
+struct State {
+  // A full set of items (including non-kernel items) representing the state,
+  // sorted by LessItem.
+  ItemSet Items;
+
+  std::string dump(const Grammar &G) const;
+  struct LessItem {
+    LessItem(const Grammar &G) : G(G) {}
+    bool operator()(const Item &L, const Item &R) {
+      if (L.hasNext() && R.hasNext()) {
+        if (L.next(G) == R.next(G))
+          return L < R;
+        return L.next(G) < R.next(G);
+      }
+      // Item with a trailing dot is considered minimal.
+      return L.hasNext() < R.hasNext();
+    }
+    const Grammar &G;
+  };
+};
+
+// LRGraph is a deterministic finite state automaton for LR parsing.
+//
+// Intuitively, an LR automaton is a transition graph. The graph has a
+// collection of nodes, called States. Each state corresponds to a particular
+// item set, which represents a condition that could occur duing the process of
+// parsing a production. Edges are directed from one state to another. Each edge
+// is labeled by a grammar symbol (terminal or nonterminal).
+//
+// LRGraph is used to construct the LR parsing table which is a core
+// data-structure driving the LR parser.
+class LRGraph {
+public:
+  // StateID is the index in States table.
+  using StateID = uint16_t;
+
+  // Constructs an LR(0) automaton.
+  static LRGraph buildLR0(const Grammar &);
+
+  // An edge in the LR graph, it represents a transition in the LR automaton.
+  // If the parser is at state Src, with a lookahead Label, then it
+  // transits to state Dst.
+  struct Edge {
+    StateID Src, Dst;
+    SymbolID Label;
+  };
+
+  llvm::ArrayRef<State> states() const { return States; }
+  llvm::ArrayRef<Edge> edges() const { return Edges; }
+
+  std::string dumpForTests(const Grammar &) const;
+
+private:
+  LRGraph(std::vector<State> States, std::vector<Edge> Edges)
+      : States(std::move(States)), Edges(std::move(Edges)) {}
+
+  std::vector<State> States;
+  std::vector<Edge> Edges;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+namespace llvm {
+// Support clang::syntax::pseudo::Item as DenseMap keys.
+template <> struct DenseMapInfo<clang::syntax::pseudo::Item> {
+  static inline clang::syntax::pseudo::Item getEmptyKey() {
+    static clang::syntax::pseudo::Item EmptyKey(-1, 0, 0);
+    return EmptyKey;
+  }
+  static inline clang::syntax::pseudo::Item getTombstoneKey() {
+    static clang::syntax::pseudo::Item TombstoneKey(-2, 0, 0);
+    return TombstoneKey;
+  }
+  static unsigned getHashValue(const clang::syntax::pseudo::Item &I) {
+    return hash_value(I);
+  }
+  static bool isEqual(const clang::syntax::pseudo::Item &LHS,
+                      const clang::syntax::pseudo::Item &RHS) {
+    return LHS == RHS;
+  }
+};
+} // namespace llvm
+
+#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D119172: [pseudo] Implement LRGraph

Reply via email to