hokein created this revision. hokein added a reviewer: sammccall. Herald added subscribers: mgrang, mgorny. Herald added a project: All. hokein requested review of this revision. Herald added a subscriber: alextsao1999. Herald added a project: clang-tools-extra.
- Add annotation handling ([key=value]) in the BNF grammar parser; - Define and setup the API in the grammar for attributes; - Implement a builtin guard for two simple c++ contexual-override/final use cases; Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D126536 Files: clang-tools-extra/pseudo/gen/Main.cpp clang-tools-extra/pseudo/include/clang-pseudo/GLR.h clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h clang-tools-extra/pseudo/lib/GLR.cpp clang-tools-extra/pseudo/lib/cxx.bnf clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt clang-tools-extra/pseudo/lib/cxx/CXX.cpp clang-tools-extra/pseudo/lib/grammar/Grammar.cpp clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp clang-tools-extra/pseudo/unittests/GLRTest.cpp clang-tools-extra/pseudo/unittests/GrammarTest.cpp
Index: clang-tools-extra/pseudo/unittests/GrammarTest.cpp =================================================================== --- clang-tools-extra/pseudo/unittests/GrammarTest.cpp +++ clang-tools-extra/pseudo/unittests/GrammarTest.cpp @@ -99,6 +99,14 @@ EXPECT_LT(ruleFor("x"), ruleFor("_")); } +TEST_F(GrammarTest, Annotation) { + build(R"bnf( + _ := IDENTIFIER [guard=override] + )bnf"); + ASSERT_TRUE(Diags.empty()); + EXPECT_NE(G->lookupRule(ruleFor("_")).Guard, NoneAttribute); +} + TEST_F(GrammarTest, Diagnostics) { build(R"cpp( _ := ,_opt @@ -110,17 +118,25 @@ # cycle a := b b := a + + _ := IDENTIFIER [guard=override;unknown=value] + _ := IDENTIFIER [guard=override] ; + _ := [guard=override] ; )cpp"); EXPECT_EQ(G->underscore(), id("_")); - EXPECT_THAT(Diags, UnorderedElementsAre( - "Rule '_ := ,_opt' has a nullable RHS", - "Rule 'null := ' has a nullable RHS", - "No rules for nonterminal: undefined-sym", - "Failed to parse 'invalid': no separator :=", - "Token-like name IDENFIFIE is used as a nonterminal", - "No rules for nonterminal: IDENFIFIE", - "The grammar contains a cycle involving symbol a")); + EXPECT_THAT( + Diags, UnorderedElementsAre( + "Rule '_ := ,_opt' has a nullable RHS", + "Rule 'null := ' has a nullable RHS", + "No rules for nonterminal: undefined-sym", + "Failed to parse 'invalid': no separator :=", + "Token-like name IDENFIFIE is used as a nonterminal", + "No rules for nonterminal: IDENFIFIE", + "The grammar contains a cycle involving symbol a", + "Unknown attribute key 'unknown'", + "Attribute 'guard' can only be specified at the end of a rule", + "Attributes are not allowed at the beginning of the rule")); } TEST_F(GrammarTest, FirstAndFollowSets) { Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp =================================================================== --- clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -158,7 +158,7 @@ std::vector<ParseStep> PendingReduce = { {GSSNode1, Action::reduce(ruleFor("class-name"))}, {GSSNode1, Action::reduce(ruleFor("enum-name"))}}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, + glrReduce(PendingReduce, TokenStream(), {*G, Table, Arena, GSStack}, captureNewHeads()); EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre( @@ -195,7 +195,7 @@ {/*State=*/3, id("ptr-operator"), Action::goTo(/*NextState=*/6)}}); std::vector<ParseStep> PendingReduce = { {GSSNode4, Action::reduce(ruleFor("ptr-operator"))}}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, + glrReduce(PendingReduce, TokenStream(), {*G, Table, Arena, GSStack}, captureNewHeads()); EXPECT_THAT(NewHeadResults, @@ -250,7 +250,7 @@ { GSSNode4, Action::reduce(/*RuleID=*/1) // type-name := enum-name }}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, + glrReduce(PendingReduce, TokenStream(), {*G, Table, Arena, GSStack}, captureNewHeads()); // Verify that the stack heads are joint at state 5 after reduces. @@ -306,7 +306,7 @@ { GSSNode4, Action::reduce(/*RuleID=*/1) // pointer := enum-name * }}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, + glrReduce(PendingReduce, TokenStream(), {*G, Table, Arena, GSStack}, captureNewHeads()); EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre( Index: clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp =================================================================== --- clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp +++ clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp @@ -47,16 +47,28 @@ // Assemble the name->ID and ID->nonterminal name maps. llvm::DenseSet<llvm::StringRef> UniqueNonterminals; llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds; + + llvm::DenseSet<llvm::StringRef> UniqueAttrKeys; + llvm::DenseSet<llvm::StringRef> UniqueAttrValues = {""}; + llvm::DenseMap<llvm::StringRef, AttributeID> AttrValueIDs; + for (uint16_t I = 0; I < NumTerminals; ++I) SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I))); auto Consider = [&](llvm::StringRef Name) { if (!SymbolIds.count(Name)) UniqueNonterminals.insert(Name); }; + auto ConsiderAttrValue = [&](llvm::StringRef AttrValueName) { + if (!AttrValueIDs.count(AttrValueName)) + UniqueAttrValues.insert(AttrValueName); + }; for (const auto &Spec : Specs) { Consider(Spec.Target); - for (const RuleSpec::Element &Elt : Spec.Sequence) + for (const RuleSpec::Element &Elt : Spec.Sequence) { Consider(Elt.Symbol); + for (const auto& Attr : Elt.Attributes) + ConsiderAttrValue(Attr.second); + } } llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) { T->Nonterminals.emplace_back(); @@ -68,6 +80,13 @@ const GrammarTable::Nonterminal &R) { return L.Name < R.Name; }); + + llvm::for_each(UniqueAttrValues, [&T](llvm::StringRef Name) { + T->Attributes.emplace_back(); + T->Attributes.back() = Name.str(); + }); + llvm::sort(T->Attributes); + // Build name -> ID maps for nonterminals. for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID); @@ -75,17 +94,42 @@ // Convert the rules. T->Rules.reserve(Specs.size()); std::vector<SymbolID> Symbols; + std::vector<AttributeID> Attributes; auto Lookup = [SymbolIds](llvm::StringRef Name) { auto It = SymbolIds.find(Name); assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!"); return It->second; }; + auto LookupAttr = [](llvm::StringRef Name, + llvm::ArrayRef<std::string> SortedArray) { + assert(llvm::is_sorted(SortedArray) && "Array is not sorted!"); + const auto *It = llvm::partition_point( + SortedArray, [&](llvm::StringRef X) { return X < Name; }); + assert(It != SortedArray.end() && *It == Name && + "Didn't find the symbol in AttrValues!"); + return It - SortedArray.begin(); + }; for (const auto &Spec : Specs) { assert(Spec.Sequence.size() <= Rule::MaxElements); Symbols.clear(); - for (const RuleSpec::Element &Elt : Spec.Sequence) + AttributeID Guard = NoneAttribute; + for (const RuleSpec::Element &Elt : Spec.Sequence) { Symbols.push_back(Lookup(Elt.Symbol)); - T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols)); + + for (const auto& AttrKeyAndValue : Elt.Attributes) { + switch(AttrKeyAndValue.first) { + case AttributeKey::Guard: + if (&Elt != &Spec.Sequence.back()) { + Diagnostics.push_back("Attribute 'guard' can only be specified " + "at the end of a rule"); + continue; + } + Guard = LookupAttr(AttrKeyAndValue.second, T->Attributes); + break; + } + } + } + T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols, Guard)); } assert(T->Rules.size() < (1 << RuleBits) && "Too many rules to fit in RuleID bits!"); @@ -164,6 +208,9 @@ llvm::StringRef Target; struct Element { llvm::StringRef Symbol; // Name of the symbol + // Attributes that are associated to the sequence symbol or rule. + std::vector<std::pair<AttributeKey, llvm::StringRef/*Value*/>> + Attributes; }; std::vector<Element> Sequence; @@ -204,11 +251,46 @@ Chunk = Chunk.trim(); if (Chunk.empty()) continue; // skip empty + if (Chunk.startswith("[") && Chunk.endswith("]")) { + if (Out.Sequence.empty()) { + Diagnostics.push_back( + "Attributes are not allowed at the beginning of the rule"); + continue; + } + parseAttribute(Chunk, Out.Sequence.back().Attributes); + continue; + } Out.Sequence.push_back({Chunk}); } return true; - }; + } + + bool parseAttribute( + llvm::StringRef Content, + std::vector<std::pair<AttributeKey, llvm::StringRef>> &Out) { + assert(Content.startswith("[") && Content.endswith("]")); + for (llvm::StringRef Attr : + llvm::split(Content.drop_front().drop_back(), ';')) { + auto KeyAndValue = Attr.split('='); + if (KeyAndValue.first == Attr) { // no separator in Line + Diagnostics.push_back( + llvm::formatv("Failed to parse attribute '{0}': no separator =", + Attr) + .str()); + return false; + } + if (KeyAndValue.first == "guard") { + Out.push_back({AttributeKey::Guard, KeyAndValue.second.trim()}); + continue; + } + + Diagnostics.push_back( + llvm::formatv("Unknown attribute key '{0}'", KeyAndValue.first) + .str()); + } + return true; + } // Inlines all _opt symbols. // For example, a rule E := id +_opt id, after elimination, we have two Index: clang-tools-extra/pseudo/lib/grammar/Grammar.cpp =================================================================== --- clang-tools-extra/pseudo/lib/grammar/Grammar.cpp +++ clang-tools-extra/pseudo/lib/grammar/Grammar.cpp @@ -17,8 +17,10 @@ namespace clang { namespace pseudo { -Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence) - : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) { +Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence, + AttributeID Guard) + : Target(Target), Size(static_cast<uint8_t>(Sequence.size())), + Guard(Guard) { assert(Sequence.size() <= Rule::MaxElements); llvm::copy(Sequence, this->Sequence); } @@ -61,6 +63,8 @@ OS << symbolName(R.Target) << " :="; for (SymbolID SID : R.seq()) OS << " " << symbolName(SID); + if (R.Guard) + OS << " [guard=" << T->Attributes[R.Guard] << "]"; return Result; } Index: clang-tools-extra/pseudo/lib/cxx/CXX.cpp =================================================================== --- clang-tools-extra/pseudo/lib/cxx/CXX.cpp +++ clang-tools-extra/pseudo/lib/cxx/CXX.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/cxx/CXX.h" +#include "clang-pseudo/Forest.h" +#include "clang-pseudo/Grammar.h" #include "clang-pseudo/LRTable.h" +#include "clang/Basic/TokenKinds.h" namespace clang { namespace pseudo { @@ -29,6 +32,23 @@ return *Table; } +bool guard(RuleID RID, llvm::ArrayRef<const ForestNode *> RHS, + const TokenStream &Tokens, const Grammar &G) { + auto &Rule = G.lookupRule(RID); + assert(Rule.Guard != NoneAttribute); + if (Rule.Guard == static_cast<AttributeID>(Atrribute::Override)) { + assert(RHS.size() == 1 && + RHS.front()->symbol() == tokenSymbol(clang::tok::identifier)); + return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "override"; + } + if (Rule.Guard == static_cast<AttributeID>(Atrribute::Final)) { + assert(RHS.size() == 1 && + RHS.front()->symbol() == tokenSymbol(clang::tok::identifier)); + return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "final"; + } + return true; +} + } // namespace cxx } // namespace pseudo } // namespace clang Index: clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt =================================================================== --- clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt +++ clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt @@ -6,4 +6,5 @@ LINK_LIBS clangPseudoGrammar + clangPseudo ) Index: clang-tools-extra/pseudo/lib/cxx.bnf =================================================================== --- clang-tools-extra/pseudo/lib/cxx.bnf +++ clang-tools-extra/pseudo/lib/cxx.bnf @@ -739,8 +739,8 @@ #! Contextual keywords -- clang lexer always lexes them as identifier tokens. #! Placeholders for literal text in the grammar that lex as other things. -contextual-override := IDENTIFIER -contextual-final := IDENTIFIER +contextual-override := IDENTIFIER [guard=Override] +contextual-final := IDENTIFIER [guard=Final] contextual-zero := NUMERIC_CONSTANT module-keyword := IDENTIFIER import-keyword := IDENTIFIER Index: clang-tools-extra/pseudo/lib/GLR.cpp =================================================================== --- clang-tools-extra/pseudo/lib/GLR.cpp +++ clang-tools-extra/pseudo/lib/GLR.cpp @@ -72,11 +72,10 @@ for (const auto *Head : NewHeads) AddSteps(Head, Terminal.symbol()); NewHeads.clear(); - glrReduce(PendingReduce, Params, - [&](const GSS::Node * NewHead) { - // A reduce will enable more steps. - AddSteps(NewHead, Terminal.symbol()); - }); + glrReduce(PendingReduce, Tokens, Params, [&](const GSS::Node *NewHead) { + // A reduce will enable more steps. + AddSteps(NewHead, Terminal.symbol()); + }); glrShift(PendingShift, Terminal, Params, [&](const GSS::Node *NewHead) { NewHeads.push_back(NewHead); }); @@ -84,11 +83,10 @@ LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next is eof\n")); for (const auto *Heads : NewHeads) AddSteps(Heads, tokenSymbol(tok::eof)); - glrReduce(PendingReduce, Params, - [&](const GSS::Node * NewHead) { - // A reduce will enable more steps. - AddSteps(NewHead, tokenSymbol(tok::eof)); - }); + glrReduce(PendingReduce, Tokens, Params, [&](const GSS::Node *NewHead) { + // A reduce will enable more steps. + AddSteps(NewHead, tokenSymbol(tok::eof)); + }); if (!PendingAccept.empty()) { LLVM_DEBUG({ @@ -211,8 +209,8 @@ // After reducing 3 by `pointer := class-name STAR` and // 2 by`enum-name := class-name STAR`: // 0--5(pointer) // 5 is goto(0, pointer) -void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params, - NewHeadCallback NewHeadCB) { +void glrReduce(std::vector<ParseStep> &PendingReduce, const TokenStream &Tokens, + const ParseParams &Params, NewHeadCallback NewHeadCB) { // There are two interacting complications: // 1. Performing one reduce can unlock new reduces on the newly-created head. // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes). @@ -282,6 +280,10 @@ TempSequence.resize_for_overwrite(Rule.Size); auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) { if (I == Rule.Size) { + if (Rule.Guard && Params.Guard && + !Params.Guard(RID, TempSequence, Tokens, Params.G)) + return; + F.Start = TempSequence.front()->startTokenIndex(); Bases.emplace(F, N); LLVM_DEBUG(llvm::dbgs() << " --> base at S" << N->State << "\n"); Index: clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h =================================================================== --- clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h +++ clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h @@ -28,13 +28,25 @@ namespace clang { namespace pseudo { class LRTable; +class ForestNode; +class TokenStream; namespace cxx { // Symbol represents nonterminal symbols in the C++ grammar. // It provides a simple uniform way to access a particular nonterminal. enum class Symbol : SymbolID { #define NONTERMINAL(X, Y) X = Y, +#define ATTRIBUTE(X, Y) #include "CXXSymbols.inc" +#undef ATTRIBUTE +#undef NONTERMINAL +}; + +enum class Atrribute : AttributeID { +#define NONTERMINAL(X, Y) +#define ATTRIBUTE(X, Y) X = Y, +#include "CXXSymbols.inc" +#undef ATTRIBUTE #undef NONTERMINAL }; @@ -43,6 +55,14 @@ // Returns the corresponding LRTable for the C++ grammar. const LRTable &getLRTable(); +// Implements a C++ specific "guard" attribute. +// It determines whether a reduction of a rule is performed in the GLR parser. +// +// Returns false if the reduction is not performed (this parsing branch in GLR +// will die). +bool guard(RuleID RID, llvm::ArrayRef<const ForestNode *> RHS, + const TokenStream &Tokens, const Grammar &G); + } // namespace cxx } // namespace pseudo Index: clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h =================================================================== --- clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h +++ clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h @@ -19,6 +19,19 @@ // production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either // nonterminal or terminal, identified by a SymbolID. // +// Attributes have the syntax form of [guard=value;key=value], they are +// associated with a grammar symbol (on the right-hand side of the symbol) or +// a grammar rule (at the end of the rule body). +// +// Attributes provide a way to inject custom code into the general GLR +// parser. For example, we have a rule in the C++ grammar: +// +// contextual-override := IDENTIFIER [guard=Override] +// +// This rule is guarded -- the reduction of this rule will be conducted by the +// GLR parser only if the IDENTIFIER content is `override` (see guard +// implementationin CXX.h). +// // Notions about the BNF grammar: // - "_" is the start symbol of the augmented grammar; // - single-line comment is supported, starting with a # @@ -69,6 +82,20 @@ return TokenFlag | static_cast<SymbolID>(TK); } +// An attribute is specified in the grammar like [guard=value]. +// Defines the built-in attribute keys. +enum class AttributeKey : uint8_t { + // A guard controls whether a reduction of a rule will be conducted by the GLR + // parser. + Guard, +}; +// An AttributeID uniquely identifies an attribute in a grammar. +// It is the index into a table of attribute values. +// NOTE: value among attributes must be unique even within different keys! +using AttributeID = uint16_t; +// A sentinel attributeID indicating no attributes, by default. +static constexpr AttributeID NoneAttribute = 0; + // A RuleID uniquely identifies a production rule in a grammar. // It is an index into a table of rules. using RuleID = uint16_t; @@ -79,7 +106,8 @@ // expression := a b c // ^Target ^Sequence struct Rule { - Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq); + Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq, + AttributeID Guard = NoneAttribute); // We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens // long, however, we're stricter in order to reduce the size, we limit the max @@ -96,11 +124,13 @@ uint8_t Size : SizeBits; // Size of the Sequence SymbolID Sequence[MaxElements]; + AttributeID Guard = NoneAttribute; + llvm::ArrayRef<SymbolID> seq() const { return llvm::ArrayRef<SymbolID>(Sequence, Size); } friend bool operator==(const Rule &L, const Rule &R) { - return L.Target == R.Target && L.seq() == R.seq(); + return L.Target == R.Target && L.seq() == R.seq() && L.Guard == R.Guard; } }; @@ -186,6 +216,9 @@ // A table of nonterminals, sorted by name. // SymbolID is the index of the table. std::vector<Nonterminal> Nonterminals; + // A table of attribute values, sorted by name. + // AttributeID is the index of the table. + std::vector<std::string> Attributes; }; } // namespace pseudo Index: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h =================================================================== --- clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -110,6 +110,10 @@ }; llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &); +using ReduceGuard = + std::function<bool(RuleID RID, llvm::ArrayRef<const ForestNode *> RHS, + const TokenStream &Tokens, const Grammar &G)>; + // Parameters for the GLR parsing. struct ParseParams { // The grammar of the language we're going to parse. @@ -121,6 +125,8 @@ // Arena for data structure used by the GLR algorithm. ForestArena &Forest; // Storage for the output forest. GSS &GSStack; // Storage for parsing stacks. + + ReduceGuard Guard; // Guard for reducing. }; // Parses the given token stream as the start symbol with the GLR algorithm, // and returns a forest node of the start symbol. @@ -157,8 +163,8 @@ // add elements to PendingReduce // // Exposed for testing only. -void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params, - NewHeadCallback NewHeadCB); +void glrReduce(std::vector<ParseStep> &PendingReduce, const TokenStream &Tokens, + const ParseParams &Params, NewHeadCallback NewHeadCB); } // namespace pseudo } // namespace clang Index: clang-tools-extra/pseudo/gen/Main.cpp =================================================================== --- clang-tools-extra/pseudo/gen/Main.cpp +++ clang-tools-extra/pseudo/gen/Main.cpp @@ -86,6 +86,12 @@ std::replace(Name.begin(), Name.end(), '-', '_'); Out.os() << llvm::formatv("NONTERMINAL({0}, {1})\n", Name, ID); } + for (clang::pseudo::AttributeID AID = 1 /*skip the sentinel 0 value*/; + AID < G->table().Attributes.size(); ++AID) { + llvm::StringRef Name = G->table().Attributes[AID]; + assert(!Name.empty()); + Out.os() << llvm::formatv("ATTRIBUTE({0}, {1})\n", Name, AID); + } break; case EmitGrammarContent: for (llvm::StringRef Line : llvm::split(GrammarText, '\n')) {
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits