[PATCH] D127357: [pseudo] wip/prototype: use LR0 instead of SLR1 table

Sam McCall via Phabricator via cfe-commits Tue, 21 Jun 2022 07:06:14 -0700

sammccall updated this revision to Diff 438693.
sammccall added a comment.

Deeper version of LR0, still prototype-quality


- use hashtables instead of binary search where appropriate
- simplify GLR accordingly
- add fast-path reduce to reclaim some of the performance (though "reclaim" is 
maybe wrong - this optimization can with hindsight apply to SLR1 also)

Numbers to follow.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D127357/new/

https://reviews.llvm.org/D127357

Files:
  clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/cxx/CXX.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/lr-build-basic.test
  clang-tools-extra/pseudo/test/lr-build-conflicts.test
  clang-tools-extra/pseudo/tool/ClangPseudo.cpp

Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp
===================================================================
--- clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -108,7 +108,7 @@
       llvm::outs() << G->dump();
     if (PrintGraph)
       llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G);
-    auto LRTable = clang::pseudo::LRTable::buildSLR(*G);
+    auto LRTable = clang::pseudo::LRTable::buildLR0(*G);
     if (PrintTable)
       llvm::outs() << LRTable.dumpForTests(*G);
     if (PrintStatistics)
Index: clang-tools-extra/pseudo/test/lr-build-conflicts.test
===================================================================
--- clang-tools-extra/pseudo/test/lr-build-conflicts.test
+++ clang-tools-extra/pseudo/test/lr-build-conflicts.test
@@ -35,12 +35,10 @@
 # TABLE-NEXT: State 1
 # TABLE-NEXT:     '-': shift state 3
 # TABLE-NEXT: State 2
-# TABLE-NEXT:     'EOF': reduce by rule 1 'expr := IDENTIFIER'
-# TABLE-NEXT:     '-': reduce by rule 1 'expr := IDENTIFIER'
+# TABLE-NEXT:     'EOD': reduce by rule 1 'expr := IDENTIFIER'
 # TABLE-NEXT: State 3
 # TABLE-NEXT:     'IDENTIFIER': shift state 2
 # TABLE-NEXT:     'expr': go to state 4
 # TABLE-NEXT: State 4
-# TABLE-NEXT:     'EOF': reduce by rule 0 'expr := expr - expr'
+# TABLE-NEXT:     'EOD': reduce by rule 0 'expr := expr - expr'
 # TABLE-NEXT:     '-': shift state 3
-# TABLE-NEXT:     '-': reduce by rule 0 'expr := expr - expr'
Index: clang-tools-extra/pseudo/test/lr-build-basic.test
===================================================================
--- clang-tools-extra/pseudo/test/lr-build-basic.test
+++ clang-tools-extra/pseudo/test/lr-build-basic.test
@@ -23,6 +23,6 @@
 # TABLE-NEXT:     'id': go to state 2
 # TABLE-NEXT: State 1
 # TABLE-NEXT: State 2
-# TABLE-NEXT:     'EOF': reduce by rule 1 'expr := id'
+# TABLE-NEXT:     'EOD': reduce by rule 1 'expr := id'
 # TABLE-NEXT: State 3
-# TABLE-NEXT:     'EOF': reduce by rule 0 'id := IDENTIFIER'
+# TABLE-NEXT:     'EOD': reduce by rule 0 'id := IDENTIFIER'
Index: clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
+++ clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
@@ -9,35 +9,12 @@
 #include "clang-pseudo/Grammar.h"
 #include "clang-pseudo/LRGraph.h"
 #include "clang-pseudo/LRTable.h"
-#include "clang/Basic/TokenKinds.h"
 #include <cstdint>
 
-namespace llvm {
-template <> struct DenseMapInfo<clang::pseudo::LRTable::Entry> {
-  using Entry = clang::pseudo::LRTable::Entry;
-  static inline Entry getEmptyKey() {
-    static Entry E{static_cast<clang::pseudo::SymbolID>(-1), 0,
-                   clang::pseudo::LRTable::Action::sentinel()};
-    return E;
-  }
-  static inline Entry getTombstoneKey() {
-    static Entry E{static_cast<clang::pseudo::SymbolID>(-2), 0,
-                   clang::pseudo::LRTable::Action::sentinel()};
-    return E;
-  }
-  static unsigned getHashValue(const Entry &I) {
-    return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque());
-  }
-  static bool isEqual(const Entry &LHS, const Entry &RHS) {
-    return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol &&
-           LHS.Act == RHS.Act;
-  }
-};
-} // namespace llvm
-
 namespace clang {
 namespace pseudo {
 
+#if 0
 class LRTable::Builder {
 public:
   Builder(llvm::ArrayRef<std::pair<SymbolID, StateID>> StartStates)
@@ -134,6 +111,41 @@
   }
   return std::move(Build).build(G.table(), Graph.states().size());
 }
+#endif
+
+LRTable LRTable::buildLR0(const Grammar &G) {
+  auto Graph = LRGraph::buildLR0(G);
+  assert(Graph.states().size() <= (1 << StateBits) &&
+         "Graph states execceds the maximum limit!");
+
+  std::vector<std::pair<StateID, RuleID>> Reduce;
+  llvm::DenseMap<StateSymbol, StateID> Shift;
+  llvm::DenseMap<StateSymbol, StateID> GoTo;
+
+  for (const auto &T : Graph.edges()) {
+    (isToken(T.Label) ? Shift : GoTo)
+        .try_emplace(StateSymbol{T.Src, T.Label}, T.Dst);
+  }
+
+  for (StateID SID = 0; SID < Graph.states().size(); ++SID) {
+    for (const Item &I : Graph.states()[SID].Items) {
+      if (!I.hasNext()) {
+        // If we've just parsed the start symbol, this means we successfully
+        // parse the input. We don't add the reduce action of `_ :=
+        // start_symbol` in the LRTable (the GLR parser handles it
+        // specifically).
+        if (G.lookupRule(I.rule()).Target == G.underscore())
+          continue;
+        // If we've reached the end of a rule A := ..., then we can reduce.
+        Reduce.push_back({SID, I.rule()});
+      }
+    }
+  }
+  return LRTable(
+      std::move(Shift),
+      OffsetTable<StateID, RuleID>(std::move(Reduce), Graph.states().size()),
+      std::move(GoTo), Graph.startStates());
+}
 
 } // namespace pseudo
 } // namespace clang
Index: clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
+++ clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
@@ -17,28 +17,13 @@
 namespace clang {
 namespace pseudo {
 
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) {
-  switch (A.kind()) {
-  case LRTable::Action::Shift:
-    return OS << llvm::formatv("shift state {0}", A.getShiftState());
-  case LRTable::Action::Reduce:
-    return OS << llvm::formatv("reduce by rule {0}", A.getReduceRule());
-  case LRTable::Action::GoTo:
-    return OS << llvm::formatv("go to state {0}", A.getGoToState());
-  case LRTable::Action::Sentinel:
-    llvm_unreachable("unexpected Sentinel action kind!");
-  }
-  llvm_unreachable("unexpected action kind!");
-}
-
 std::string LRTable::dumpStatistics() const {
   return llvm::formatv(R"(
 Statistics of the LR parsing table:
-    number of states: {0}
-    number of actions: {1}
-    size of the table (bytes): {2}
+    number of actions: shift={0} reduce={1} goto={2}
+    size of the table (bytes): {3}
 )",
-                       StateOffset.size() - 1, Actions.size(), bytes())
+                       Shift.size(), Reduce.size(), Goto.size(), bytes())
       .str();
 }
 
@@ -46,6 +31,7 @@
   std::string Result;
   llvm::raw_string_ostream OS(Result);
   OS << "LRTable:\n";
+#if 0
   for (StateID S = 0; S < StateOffset.size() - 1; ++S) {
     OS << llvm::formatv("State {0}\n", S);
     for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) {
@@ -69,43 +55,10 @@
                                     getGoToState(S, NontermID));
     }
   }
+#endif
   return OS.str();
 }
 
-llvm::ArrayRef<LRTable::Action> LRTable::getActions(StateID State,
-                                                    SymbolID Terminal) const {
-  assert(pseudo::isToken(Terminal) && "expect terminal symbol!");
-  return find(State, Terminal);
-}
-
-LRTable::StateID LRTable::getGoToState(StateID State,
-                                       SymbolID Nonterminal) const {
-  assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!");
-  auto Result = find(State, Nonterminal);
-  assert(Result.size() == 1 && Result.front().kind() == Action::GoTo);
-  return Result.front().getGoToState();
-}
-
-llvm::ArrayRef<LRTable::Action> LRTable::find(StateID Src, SymbolID ID) const {
-  assert(Src + 1u < StateOffset.size());
-  std::pair<size_t, size_t> Range =
-      std::make_pair(StateOffset[Src], StateOffset[Src + 1]);
-  auto SymbolRange = llvm::makeArrayRef(Symbols.data() + Range.first,
-                                        Symbols.data() + Range.second);
-
-  assert(llvm::is_sorted(SymbolRange) &&
-         "subrange of the Symbols should be sorted!");
-  const LRTable::StateID *Start =
-      llvm::partition_point(SymbolRange, [&ID](SymbolID S) { return S < ID; });
-  if (Start == SymbolRange.end())
-    return {};
-  const LRTable::StateID *End = Start;
-  while (End != SymbolRange.end() && *End == ID)
-    ++End;
-  return llvm::makeArrayRef(&Actions[Start - Symbols.data()],
-                            /*length=*/End - Start);
-}
-
 LRTable::StateID LRTable::getStartState(SymbolID Target) const {
   assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!");
   auto It = llvm::partition_point(
Index: clang-tools-extra/pseudo/lib/cxx/CXX.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/cxx/CXX.cpp
+++ clang-tools-extra/pseudo/lib/cxx/CXX.cpp
@@ -25,7 +25,7 @@
 }
 
 const LRTable &getLRTable() {
-  static LRTable *Table = new LRTable(LRTable::buildSLR(getGrammar()));
+  static LRTable *Table = new LRTable(LRTable::buildLR0(getGrammar()));
   return *Table;
 }
 
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -46,67 +46,35 @@
   auto &GSS = Params.GSStack;
 
   // Lists of active shift, reduce actions.
-  std::vector<ParseStep> PendingShift, PendingReduce;
-  auto AddSteps = [&](const GSS::Node *Head, SymbolID NextTok) {
-    for (const auto &Action : Params.Table.getActions(Head->State, NextTok)) {
-      switch (Action.kind()) {
-      case LRTable::Action::Shift:
-        PendingShift.push_back({Head, Action});
-        break;
-      case LRTable::Action::Reduce:
-        PendingReduce.push_back({Head, Action});
-        break;
-      default:
-        llvm_unreachable("unexpected action kind!");
-      }
-    }
-  };
   StateID StartState = Params.Table.getStartState(StartSymbol);
-  std::vector<const GSS::Node *> NewHeads = {
+  std::vector<const GSS::Node *> Heads = {
       GSS.addNode(/*State=*/StartState,
                   /*ForestNode=*/nullptr, {})};
+  std::vector<const GSS::Node *> NextHeads;
   auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
-    assert(PendingShift.empty() && PendingReduce.empty() &&
-           "Running GC at the wrong time!");
-
+    assert(NextHeads.empty() && "Running GC at the wrong time!");
     if (++I != 20) // Run periodically to balance CPU and memory usage.
       return;
     I = 0;
 
     // We need to copy the list: Roots is consumed by the GC.
-    Roots = NewHeads;
+    Roots = Heads;
     GSS.gc(std::move(Roots));
   };
   for (const ForestNode &Terminal : Terminals) {
     LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n",
                                              G.symbolName(Terminal.symbol()),
                                              Terminal.symbol()));
-    for (const auto *Head : NewHeads)
-      AddSteps(Head, Terminal.symbol());
-    NewHeads.clear();
-    glrReduce(PendingReduce, Params,
-              [&](const GSS::Node * NewHead) {
-                // A reduce will enable more steps.
-                AddSteps(NewHead, Terminal.symbol());
-              });
-
-    glrShift(PendingShift, Terminal, Params,
-             [&](const GSS::Node *NewHead) { NewHeads.push_back(NewHead); });
+    glrShift(Heads, Terminal, Params, NextHeads);
+    std::swap(Heads, NextHeads);
+    glrReduce(Heads, Params);
+    NextHeads.clear();
     MaybeGC();
   }
-  LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next is eof\n"));
-  for (const auto *Heads : NewHeads)
-    AddSteps(Heads, tokenSymbol(tok::eof));
+  LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
 
   StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol);
   // Collect new heads created from the final reduce.
-  std::vector<const GSS::Node*> Heads;
-  glrReduce(PendingReduce, Params, [&](const GSS::Node *NewHead) {
-    Heads.push_back(NewHead);
-    // A reduce will enable more steps.
-    AddSteps(NewHead, tokenSymbol(tok::eof));
-  });
-
   const ForestNode *Result = nullptr;
   for (const auto *Head : Heads) {
     if (Head->State == AcceptState) {
@@ -138,42 +106,40 @@
 // After the shift action, the GSS is:
 //   0---1---2---4
 //       â---3---â
-void glrShift(std::vector<ParseStep> &PendingShift, const ForestNode &NewTok,
-              const ParseParams &Params, NewHeadCallback NewHeadCB) {
+void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
+              const ForestNode &NewTok, const ParseParams &Params,
+              std::vector<const GSS::Node *> &NewHeads) {
   assert(NewTok.kind() == ForestNode::Terminal);
-  assert(llvm::all_of(PendingShift,
-                      [](const ParseStep &Step) {
-                        return Step.Action.kind() == LRTable::Action::Shift;
-                      }) &&
-         "Pending shift actions must be shift actions");
   LLVM_DEBUG(llvm::dbgs() << llvm::formatv("  Shift {0} ({1} active heads):\n",
                                            Params.G.symbolName(NewTok.symbol()),
-                                           PendingShift.size()));
+                                           OldHeads.size()));
 
   // We group pending shifts by their target state so we can merge them.
-  llvm::stable_sort(PendingShift, [](const ParseStep &L, const ParseStep &R) {
-    return L.Action.getShiftState() < R.Action.getShiftState();
-  });
-  auto Rest = llvm::makeArrayRef(PendingShift);
+  llvm::SmallVector<std::pair<StateID, const GSS::Node *>, 8> Shifts;
+  for (const auto* H : OldHeads)
+    if (auto S = Params.Table.getShiftState(H->State, NewTok.symbol()))
+      Shifts.push_back({*S, H});
+  llvm::stable_sort(Shifts, llvm::less_first{});
+
+  auto Rest = llvm::makeArrayRef(Shifts);
   llvm::SmallVector<const GSS::Node *> Parents;
   while (!Rest.empty()) {
     // Collect the batch of PendingShift that have compatible shift states.
     // Their heads become TempParents, the parents of the new GSS node.
-    StateID NextState = Rest.front().Action.getShiftState();
+    StateID NextState = Rest.front().first;
 
     Parents.clear();
     for (const auto &Base : Rest) {
-      if (Base.Action.getShiftState() != NextState)
+      if (Base.first != NextState)
         break;
-      Parents.push_back(Base.Head);
+      Parents.push_back(Base.second);
     }
     Rest = Rest.drop_front(Parents.size());
 
     LLVM_DEBUG(llvm::dbgs() << llvm::formatv("    --> S{0} ({1} heads)\n",
                                              NextState, Parents.size()));
-    NewHeadCB(Params.GSStack.addNode(NextState, &NewTok, Parents));
+    NewHeads.push_back(Params.GSStack.addNode(NextState, &NewTok, Parents));
   }
-  PendingShift.clear();
 }
 
 namespace {
@@ -231,8 +197,8 @@
 //   After reducing 3 by `pointer := class-name STAR` and
 //                  2 by`enum-name := class-name STAR`:
 //     0--5(pointer)       // 5 is goto(0, pointer)
-void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params,
-               NewHeadCallback NewHeadCB) {
+void glrReduce(std::vector<const GSS::Node *> &Heads,
+               const ParseParams &Params) {
   // There are two interacting complications:
   // 1.  Performing one reduce can unlock new reduces on the newly-created head.
   // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes).
@@ -311,10 +277,36 @@
     };
     DFS(Head, 0, DFS);
   };
+  unsigned PoppedHeads = 0;
+  auto FastPath = [&]() -> bool {
+    if (!Sequences.empty() || Heads.size() != PoppedHeads + 1)
+      return false;
+    const GSS::Node *Head = Heads.back();
+    auto Rules = Params.Table.getReduceRules(Head->State);
+    if (Rules.size() != 1)
+      return false;
+    const auto &Rule = Params.G.lookupRule(Rules.front());
+    const GSS::Node *Base = Head;
+    TempSequence.resize_for_overwrite(Rule.Size);
+    for (unsigned I = 0; I < Rule.Size; ++I) {
+      if (Base->parents().size() != 1)
+        return false;
+      TempSequence[Rule.Size - 1 - I] = Base->Payload;
+      Base = Base->parents().front();
+    }
+    const ForestNode *Parsed =
+        &Params.Forest.createSequence(Rule.Target, Rules.front(), TempSequence);
+    StateID NextState = Params.Table.getGoToState(Base->State, Rule.Target);
+    Heads.push_back(Params.GSStack.addNode(NextState, Parsed, {Base}));
+    return true;
+  };
   auto PopPending = [&] {
-    for (const ParseStep &Pending : PendingReduce)
-      Pop(Pending.Head, Pending.Action.getReduceRule());
-    PendingReduce.clear();
+    for (; PoppedHeads < Heads.size(); ++PoppedHeads) {
+      if (FastPath())
+        continue;
+      for (RuleID R : Params.Table.getReduceRules(Heads[PoppedHeads]->State))
+        Pop(Heads[PoppedHeads], R);
+    }
   };
 
   std::vector<std::pair</*Goto*/ StateID, const GSS::Node *>> FamilyBases;
@@ -379,9 +371,7 @@
       }
       BasesLeft = BasesLeft.drop_front(Parents.size());
 
-      // Invoking the callback for new heads, a real GLR parser may add new
-      // reduces to the PendingReduce queue!
-      NewHeadCB(Params.GSStack.addNode(NextState, Parsed, Parents));
+      Heads.push_back(Params.GSStack.addNode(NextState, Parsed, Parents));
     }
     PopPending();
   }
Index: clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
@@ -38,9 +38,39 @@
 
 #include "clang-pseudo/Grammar.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Capacity.h"
+#include "llvm/Support/raw_ostream.h"
 #include <cstdint>
 #include <vector>
 
+namespace clang {
+namespace pseudo {
+struct StateSymbol {
+  using StateID = uint16_t; // XXX
+  StateID State;
+  SymbolID Symbol;
+};
+} // namespace pseudo
+} // namespace clang
+namespace llvm {
+template <> struct llvm::DenseMapInfo<clang::pseudo::StateSymbol> {
+  using StateSymbol = clang::pseudo::StateSymbol;
+  static inline StateSymbol getEmptyKey() {
+    return StateSymbol{StateSymbol::StateID(-1), 0};
+  }
+  static inline StateSymbol getTombstoneKey() {
+    return StateSymbol{StateSymbol::StateID(-1), 1};
+  }
+  static unsigned getHashValue(const StateSymbol &Val) {
+    return (Val.State * 2754435761U) ^ Val.Symbol; // Knuth hash
+  }
+  static bool isEqual(const StateSymbol &LHS, const StateSymbol &RHS) {
+    return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol;
+  }
+};
+} // namespace llvm
 namespace clang {
 namespace pseudo {
 
@@ -60,79 +90,28 @@
   using StateID = uint16_t;
   static constexpr unsigned StateBits = 13;
 
-  // Action represents the terminal and nonterminal actions, it combines the
-  // entry of the ACTION and GOTO tables from the LR literature.
-  class Action {
-  public:
-    enum Kind : uint8_t {
-      Sentinel = 0,
-      // Terminal actions, corresponding to entries of ACTION table.
-
-      // Shift to state n: move forward with the lookahead, and push state n
-      // onto the state stack.
-      // A shift is a forward transition, and the value n is the next state that
-      // the parser is to enter.
-      Shift,
-      // Reduce by a rule: pop the state stack.
-      Reduce,
-
-      // NOTE: there are no typical accept actions in the LRtable, accept
-      // actions are handled specifically in the parser -- if the parser
-      // reaches to a target state (which is goto(StartState, StartSymbol)) at
-      // the EOF token after a reduce, this indicates the input has been parsed
-      // as the StartSymbol successfully.
-
-      // Nonterminal actions, corresponding to entry of GOTO table.
-
-      // Go to state n: push state n onto the state stack.
-      // Similar to Shift, but it is a nonterminal forward transition.
-      GoTo,
-    };
-
-    static Action goTo(StateID S) { return Action(GoTo, S); }
-    static Action shift(StateID S) { return Action(Shift, S); }
-    static Action reduce(RuleID RID) { return Action(Reduce, RID); }
-    static Action sentinel() { return Action(Sentinel, 0); }
-
-    StateID getShiftState() const {
-      assert(kind() == Shift);
-      return Value;
-    }
-    StateID getGoToState() const {
-      assert(kind() == GoTo);
-      return Value;
-    }
-    RuleID getReduceRule() const {
-      assert(kind() == Reduce);
-      return Value;
-    }
-    Kind kind() const { return static_cast<Kind>(K); }
-
-    bool operator==(const Action &L) const { return opaque() == L.opaque(); }
-    uint16_t opaque() const { return K << ValueBits | Value; };
-
-  private:
-    Action(Kind K1, unsigned Value) : K(K1), Value(Value) {}
-    static constexpr unsigned ValueBits = StateBits;
-    static constexpr unsigned KindBits = 3;
-    static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID");
-    static_assert(KindBits + ValueBits <= 16,
-                  "Must be able to store kind and value efficiently");
-    uint16_t K : KindBits;
-    // Either StateID or RuleID, depending on the Kind.
-    uint16_t Value : ValueBits;
-  };
-
-  // Returns all available actions for the given state on a terminal.
-  // Expected to be called by LR parsers.
-  llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
-  // Returns the state after we reduce a nonterminal.
-  // Expected to be called by LR parsers.
-  StateID getGoToState(StateID State, SymbolID Nonterminal) const;
-
-  // Looks up available actions.
-  // Returns empty if no available actions in the table.
-  llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
+  // Return the list of reductions applicable from a given state.
+  // These correspond to items with the dot at the end.
+  llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
+    return Reduce.find(State);
+  }
+  // Return the state to transition to after shifting a token Terminal.
+  // Returns None if this terminal is not legal here.
+  llvm::Optional<StateID> getShiftState(StateID From, SymbolID Terminal) const {
+    auto It = Shift.find(StateSymbol{From, Terminal});
+    if (It == Shift.end())
+      return llvm::None;
+    return It->second;
+  }
+  // Return the state to transition to after reducing a symbol Nonterminal.
+  // REQUIRES: this nonterminal is legal here.
+  StateID getGoToState(StateID From, SymbolID Nonterminal) const {
+    auto It = Goto.find(StateSymbol{From, Nonterminal});
+    if (It == Goto.end())
+      llvm::errs() << "From=" << From << ", NT=" << Nonterminal << "\n";
+    assert(It != Goto.end());
+    return It->second;
+  }
 
   // Returns the state from which the LR parser should start to parse the input
   // tokens as the given StartSymbol.
@@ -146,17 +125,18 @@
   StateID getStartState(SymbolID StartSymbol) const;
 
   size_t bytes() const {
-    return sizeof(*this) + Actions.capacity() * sizeof(Action) +
-           Symbols.capacity() * sizeof(SymbolID) +
-           StateOffset.capacity() * sizeof(uint32_t);
+    return sizeof(*this) + llvm::capacity_in_bytes(Shift) +
+           llvm::capacity_in_bytes(Goto) + Reduce.bytes();
   }
 
   std::string dumpStatistics() const;
   std::string dumpForTests(const Grammar &G) const;
 
+  static LRTable buildLR0(const Grammar &G);
+
+#if 0
   // Build a SLR(1) parsing table.
   static LRTable buildSLR(const Grammar &G);
-
   class Builder;
   // Represents an entry in the table, used for building the LRTable.
   struct Entry {
@@ -166,27 +146,62 @@
   };
   // Build a specifid table for testing purposes.
   static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
+#endif
 
 private:
-  // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
-  // Our physical representation is quite different for compactness.
-
-  // Index is StateID, value is the offset into Symbols/Actions
-  // where the entries for this state begin.
-  // Give a state id, the corresponding half-open range of Symbols/Actions is
-  // [StateOffset[id], StateOffset[id+1]).
-  std::vector<uint32_t> StateOffset;
-  // Parallel to Actions, the value is SymbolID (columns of the matrix).
-  // Grouped by the StateID, and only subranges are sorted.
-  std::vector<SymbolID> Symbols;
-  // A flat list of available actions, sorted by (State, SymbolID).
-  std::vector<Action> Actions;
+  // A multimap from K => V, where Ks form a dense range [0, n).
+  // A flat array stores the values:
+  //   Values = [ values for k=0 | values for k=1 | values for k=2 | ... ]
+  // And another stores the index for each K:
+  //   Keys = [ index for k=0, index for k=1, ... , Values.size()]
+  // Lookup[k] is Values[ Keys[k]...Keys[k+1] ]
+  template <typename K, typename V> struct OffsetTable {
+    std::vector<uint32_t> Keys;
+    std::vector<V> Values;
+
+    OffsetTable(std::vector<std::pair<K, V>> &&Pairs, K Limit) {
+      llvm::stable_sort(Pairs, llvm::less_first{});
+      Keys.reserve(Limit + 1);
+      Values.reserve(Pairs.size());
+      unsigned I = 0;
+      for (K Key = 0; Key < Limit; ++Key) {
+        Keys.push_back(Values.size());
+        while (I < Pairs.size() && Pairs[I].first == Key)
+          Values.push_back(Pairs[I++].second);
+      }
+      Keys.push_back(Values.size());
+      assert(Values.size() == Pairs.size());
+      assert(Keys.size() == Limit + 1);
+    }
+
+    size_t bytes() const {
+      return sizeof(*this) + llvm::capacity_in_bytes(Keys) +
+             llvm::capacity_in_bytes(Values);
+    }
+    size_t size() const { return Values.size(); }
+
+    llvm::ArrayRef<V> find(K Key) const {
+      return llvm::makeArrayRef(&Values[Keys[Key]], &Values[Keys[Key + 1]]);
+    }
+  };
+
+  llvm::DenseMap<StateSymbol, StateID> Shift;
+  OffsetTable<StateID, RuleID> Reduce;
+  llvm::DenseMap<StateSymbol, StateID> Goto;
   // A sorted table, storing the start state for each target parsing symbol.
   std::vector<std::pair<SymbolID, StateID>> StartStates;
+
+  LRTable(llvm::DenseMap<StateSymbol, StateID> Shift,
+          OffsetTable<StateID, RuleID> Reduce,
+          llvm::DenseMap<StateSymbol, StateID> GoTo,
+          std::vector<std::pair<SymbolID, StateID>> StartStates)
+      : Shift(std::move(Shift)), Reduce(std::move(Reduce)),
+        Goto(std::move(GoTo)), StartStates(std::move(StartStates)) {}
 };
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
 
 } // namespace pseudo
 } // namespace clang
 
+namespace llvm {} // namespace llvm
+
 #endif // CLANG_PSEUDO_LRTABLE_H
Index: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -132,34 +132,23 @@
 const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params,
                            SymbolID StartSymbol);
 
-// An active stack head can have multiple available actions (reduce/reduce
-// actions, reduce/shift actions).
-// A step is any one action applied to any one stack head.
-struct ParseStep {
-  // A specific stack head.
-  const GSS::Node *Head = nullptr;
-  // An action associated with the head.
-  LRTable::Action Action = LRTable::Action::sentinel();
-};
-// A callback is invoked whenever a new GSS head is created during the GLR
-// parsing process (glrShift, or glrReduce).
-using NewHeadCallback = std::function<void(const GSS::Node *)>;
 // Apply all PendingShift actions on a given GSS state, newly-created heads are
 // passed to the callback.
 //
 // When this function returns, PendingShift is empty.
 //
 // Exposed for testing only.
-void glrShift(std::vector<ParseStep> &PendingShift, const ForestNode &NextTok,
-              const ParseParams &Params, NewHeadCallback NewHeadCB);
+void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
+              const ForestNode &NextTok, const ParseParams &Params,
+              std::vector<const GSS::Node *> &NewHeads);
 // Applies PendingReduce actions, until no more reduce actions are available.
 //
 // When this function returns, PendingReduce is empty. Calls to NewHeadCB may
 // add elements to PendingReduce
 //
 // Exposed for testing only.
-void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params,
-               NewHeadCallback NewHeadCB);
+void glrReduce(std::vector<const GSS::Node *> &Heads,
+               const ParseParams &Params);
 
 } // namespace pseudo
 } // namespace clang
Index: clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
===================================================================
--- clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
+++ clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
@@ -77,11 +77,11 @@
 }
 BENCHMARK(parseBNF);
 
-static void buildSLR(benchmark::State &State) {
+static void buildLR0(benchmark::State &State) {
   for (auto _ : State)
-    LRTable::buildSLR(*G);
+    LRTable::buildLR0(*G);
 }
-BENCHMARK(buildSLR);
+BENCHMARK(buildLR0);
 
 TokenStream lexAndPreprocess() {
   clang::LangOptions LangOpts = genericLangOpts();
@@ -129,7 +129,7 @@
 BENCHMARK(preprocess);
 
 static void glrParse(benchmark::State &State) {
-  LRTable Table = clang::pseudo::LRTable::buildSLR(*G);
+  LRTable Table = clang::pseudo::LRTable::buildLR0(*G);
   SymbolID StartSymbol = *G->findNonterminal("translation-unit");
   TokenStream Stream = lexAndPreprocess();
   for (auto _ : State) {
@@ -143,7 +143,7 @@
 BENCHMARK(glrParse);
 
 static void full(benchmark::State &State) {
-  LRTable Table = clang::pseudo::LRTable::buildSLR(*G);
+  LRTable Table = clang::pseudo::LRTable::buildLR0(*G);
   SymbolID StartSymbol = *G->findNonterminal("translation-unit");
   for (auto _ : State) {
     TokenStream Stream = lexAndPreprocess();

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D127357: [pseudo] wip/prototype: use LR0 instead of SLR1 table

Reply via email to