ilya-biryukov updated this revision to Diff 195425. ilya-biryukov added a comment.
- Simplify rawByExpanded by using a helper function. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D59887/new/ https://reviews.llvm.org/D59887 Files: clang/include/clang/Tooling/Syntax/Tokens.h clang/lib/Tooling/CMakeLists.txt clang/lib/Tooling/Syntax/CMakeLists.txt clang/lib/Tooling/Syntax/Tokens.cpp clang/unittests/Tooling/CMakeLists.txt clang/unittests/Tooling/Syntax/CMakeLists.txt clang/unittests/Tooling/Syntax/TokensTest.cpp
Index: clang/unittests/Tooling/Syntax/TokensTest.cpp =================================================================== --- /dev/null +++ clang/unittests/Tooling/Syntax/TokensTest.cpp @@ -0,0 +1,629 @@ +//===- TokensTest.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Tokens.h" +#include "clang/AST/ASTConsumer.h" +#include "clang/AST/Expr.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/DiagnosticIDs.h" +#include "clang/Basic/DiagnosticOptions.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/FileSystemOptions.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.def" +#include "clang/Basic/TokenKinds.h" +#include "clang/Frontend/CompilerInstance.h" +#include "clang/Frontend/FrontendAction.h" +#include "clang/Frontend/Utils.h" +#include "clang/Lex/Lexer.h" +#include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/Token.h" +#include "clang/Tooling/Tooling.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/IntrusiveRefCntPtr.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Testing/Support/Annotations.h" +#include "gmock/gmock-more-matchers.h" +#include <cassert> +#include <cstdlib> +#include <gmock/gmock.h> +#include <gtest/gtest.h> +#include <memory> +#include <ostream> +#include <string> 
+
+using namespace clang;
+using namespace clang::syntax;
+
+using ::testing::AllOf;
+using ::testing::Contains;
+using ::testing::ElementsAre;
+using ::testing::Matcher;
+using ::testing::Pointwise;
+
+namespace {
+// Matchers for syntax::Token.
+/// Matches a token whose kind() equals K.
+MATCHER_P(Kind, K, "") { return arg.kind() == K; }
+/// Matches a token whose text, as read through SourceMgr, equals Text.
+MATCHER_P2(HasText, Text, SourceMgr, "") {
+  return arg.text(*SourceMgr) == Text;
+}
+/// Matches a tok::identifier token whose text equals Text.
+MATCHER_P2(IsIdent, Text, SourceMgr, "") {
+  return arg.kind() == tok::identifier && arg.text(*SourceMgr) == Text;
+}
+/// Checks the start and end location of a token are equal to SourceRng.
+MATCHER_P(RangeIs, SourceRng, "") {
+  return arg.location() == SourceRng.first &&
+         arg.endLocation() == SourceRng.second;
+}
+/// Checks the passed tuple has two similar tokens, i.e. both fall into the
+/// same kind equivalence class and have the same text.
+/// Identifiers, raw identifiers and keywords form one class, string literals
+/// and header names another; this ignores differences in kind between the raw
+/// and non-raw mode.
+MATCHER_P(IsSameToken, SourceMgr, "") {
+  auto ToEquivalenceClass = [](tok::TokenKind Kind) {
+    if (Kind == tok::identifier || Kind == tok::raw_identifier ||
+        tok::getKeywordSpelling(Kind) != nullptr)
+      return tok::identifier;
+    if (Kind == tok::string_literal || Kind == tok::header_name)
+      return tok::string_literal;
+    return Kind;
+  };
+
+  auto &L = std::get<0>(arg);
+  auto &R = std::get<1>(arg);
+  if (ToEquivalenceClass(L.kind()) != ToEquivalenceClass(R.kind()))
+    return false;
+  // Compare the two tokens with each other; comparing L with itself would
+  // accept any pair of tokens of matching kind.
+  return L.text(*SourceMgr) == R.text(*SourceMgr);
+}
+} // namespace
+
+// Actual test fixture lives in the syntax namespace as it's a friend of
+// TokenBuffer.
+class syntax::TokensTest : public ::testing::Test {
+public:
+  /// Run the clang frontend, collect the preprocessed tokens from the frontend
+  /// invocation and store them in this->Buffer.
+  /// This also clears SourceManager before running the compiler.
+  void recordTokens(llvm::StringRef Code) {
+    // Frontend action that attaches a TokenCollector to the preprocessor when
+    // the source file begins and moves the collected tokens into Result when
+    // it ends.
+    class RecordTokens : public ASTFrontendAction {
+    public:
+      explicit RecordTokens(TokenBuffer &Result) : Result(Result) {}
+
+      bool BeginSourceFileAction(CompilerInstance &CI) override {
+        assert(!Collector && "expected only a single call to BeginSourceFile");
+        Collector.emplace(CI.getPreprocessor());
+        return true;
+      }
+      void EndSourceFileAction() override {
+        assert(Collector && "BeginSourceFileAction was never called");
+        Result = std::move(*Collector).consume();
+      }
+
+      // The AST itself is not inspected, so a default ASTConsumer is enough.
+      std::unique_ptr<ASTConsumer>
+      CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override {
+        return llvm::make_unique<ASTConsumer>();
+      }
+
+    private:
+      TokenBuffer &Result;
+      llvm::Optional<TokenCollector> Collector;
+    };
+
+    // Register an empty placeholder for the main file in the in-memory file
+    // system; the actual contents are supplied via addRemappedFile() below.
+    constexpr const char *FileName = "./input.cpp";
+    FS->addFile(FileName, time_t(), llvm::MemoryBuffer::getMemBufferCopy(""));
+    // Prepare to run a compiler.
+    std::vector<const char *> Args = {"tok-test", "-std=c++03", "-fsyntax-only",
+                                      FileName};
+    auto CI = createInvocationFromCommandLine(Args, Diags, FS);
+    assert(CI);
+    CI->getFrontendOpts().DisableFree = false;
+    CI->getPreprocessorOpts().addRemappedFile(
+        FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release());
+    // Remember the language options so that tokenize() lexes with the same
+    // settings as this run.
+    LangOpts = *CI->getLangOpts();
+    CompilerInstance Compiler;
+    Compiler.setInvocation(std::move(CI));
+    // Install a consumer that drops diagnostics unless one is already set.
+    if (!Diags->getClient())
+      Diags->setClient(new IgnoringDiagConsumer);
+    // Reuse the fixture-owned managers so that the recorded locations can be
+    // resolved through this->SourceMgr after the compiler instance is gone.
+    Compiler.setDiagnostics(Diags.get());
+    Compiler.setFileManager(FileMgr.get());
+    Compiler.setSourceManager(SourceMgr.get());
+
+    // Start from an empty buffer; the action overwrites it when it finishes.
+    this->Buffer = TokenBuffer(*SourceMgr);
+    RecordTokens Recorder(this->Buffer);
+    ASSERT_TRUE(Compiler.ExecuteAction(Recorder))
+        << "failed to run the frontend";
+
+    DEBUG_WITH_TYPE("syntax-tokens-test", {
+      llvm::dbgs() << "=== Recorded token stream:\n";
+      this->Buffer.dump(llvm::dbgs());
+    });
+  }
+
+  /// Run syntax::tokenize() and return the results.
+  std::vector<syntax::Token> tokenize(llvm::StringRef Text) {
+    // Take one byte past the end of the std::string so that the copied buffer
+    // carries the terminating NUL and we always see 'tok::eof' at the end.
+    std::string Owned = Text.str();
+    StringRef WithTerminator(Owned.data(), Owned.size() + 1);
+    auto Contents = llvm::MemoryBuffer::getMemBufferCopy(WithTerminator);
+    auto FID = SourceMgr->createFileID(std::move(Contents));
+    return syntax::tokenize(FID, *SourceMgr, LangOpts);
+  }
+
+  /// Lexes \p ExpectedText in raw mode and expects the result to match,
+  /// token by token, the stream stored in this->Buffer.expandedTokens().
+  void expectTokens(llvm::StringRef ExpectedText) {
+    std::vector<syntax::Token> Want = tokenize(ExpectedText);
+    std::vector<syntax::Token> Got(Buffer.expandedTokens());
+    EXPECT_THAT(Got, Pointwise(IsSameToken(), Want))
+        << "\texpected tokens: " << ExpectedText;
+  }
+
+  /// Expects \p Actual and \p Expected to match pairwise under IsSameToken().
+  void expectSameTokens(llvm::ArrayRef<syntax::Token> Actual,
+                        llvm::ArrayRef<syntax::Token> Expected) {
+    std::vector<syntax::Token> Got(Actual);
+    std::vector<syntax::Token> Want(Expected);
+    EXPECT_THAT(Got, Pointwise(IsSameToken(), Want));
+  }
+
+  /// Describes one macro invocation a test expects to find in a file.
+  struct ExpectedInvocation {
+    ExpectedInvocation(
+        std::string FromText, std::string ToText,
+        llvm::Optional<llvm::Annotations::Range> TextRange = llvm::None)
+        : From(std::move(FromText)), To(std::move(ToText)), Range(TextRange) {}
+    /// The macro tokens as written in the source, in textual form.
+    std::string From;
+    /// The tokens produced by macro replacement, in textual form.
+    std::string To;
+    /// The text range of the macro invocation in the source code, if the test
+    /// wants it checked.
+    llvm::Optional<llvm::Annotations::Range> Range;
+  };
+
+  // FIXME: use a vocabulary range type instead.
+  /// Returns the {begin, end} file offsets covered by the raw tokens of \p M
+  /// in \p F: the start of the first raw token and the end of the last one.
+  std::pair<unsigned, unsigned>
+  mappingTextRange(const TokenBuffer::Mapping &M,
+                   const TokenBuffer::MarkedFile &F) {
+    assert(M.BeginRawToken < M.EndRawToken && "Invalid mapping");
+    return {
+        SourceMgr->getFileOffset(F.RawTokens.at(M.BeginRawToken).location()),
+        SourceMgr->getFileOffset(
+            F.RawTokens.at(M.EndRawToken - 1).endLocation())};
+  }
+
+  /// Looks up the FileID for \p Name. Reports a test failure and aborts if
+  /// SourceMgr has no valid FileID for that file.
+  FileID findFile(llvm::StringRef Name) const {
+    const FileEntry *Entry = FileMgr->getFile(Name);
+    FileID Found = SourceMgr->translateFile(Entry);
+    if (!Found.isValid()) {
+      ADD_FAILURE() << "SourceManager does not track " << Name;
+      std::abort();
+    }
+    return Found;
+  }
+  /// Checks that the mappings recorded in this->Buffer for \p FID match the
+  /// \p Expected ones, in order. The main file is used when \p FID is invalid.
+  void expectMacroInvocations(llvm::ArrayRef<ExpectedInvocation> Expected,
+                              FileID FID = FileID()) {
+    if (!FID.isValid())
+      FID = SourceMgr->getMainFileID();
+    EXPECT_TRUE(Buffer.Files.count(FID)) << "tokens for file were not recorded";
+    // NOTE(review): operator[] presumably creates an empty entry when FID is
+    // missing; that case was already reported by the EXPECT_TRUE above.
+    TokenBuffer::MarkedFile &File = Buffer.Files[FID];
+
+    llvm::ArrayRef<TokenBuffer::Mapping> Actual = File.Mappings;
+    ASSERT_EQ(Actual.size(), Expected.size());
+
+    for (unsigned I = 0; I < Actual.size(); ++I) {
+      const auto &A = Actual[I];
+      const auto &E = Expected[I];
+
+      // The text range is only checked when the test provided one.
+      if (E.Range)
+        ASSERT_EQ(mappingTextRange(A, File),
+                  (std::pair<unsigned, unsigned>(E.Range->Begin, E.Range->End)))
+            << "\trange does not match";
+
+      // tokenize() always ends its output with 'eof'; strip it before
+      // comparing against a mapping's tokens.
+      auto DropEOF = [](std::vector<syntax::Token> Tokens) {
+        if (Tokens.empty() || Tokens.back().kind() != tok::eof) {
+          ADD_FAILURE() << "expected 'eof' at the end of the tokens";
+          return Tokens;
+        }
+        Tokens.pop_back();
+        return Tokens;
+      };
+
+      // Raw side of the mapping: the tokens of the macro invocation.
+      std::vector<syntax::Token> ActualRaw(
+          File.RawTokens.begin() + A.BeginRawToken,
+          File.RawTokens.begin() + A.EndRawToken);
+      ASSERT_THAT(ActualRaw,
+                  Pointwise(IsSameToken(), DropEOF(tokenize(E.From))))
+          << "\tmacro tokens do not match, expected " << E.From;
+
+      // Expanded side of the mapping: the tokens after macro replacement.
+      std::vector<syntax::Token> ActualExpanded(
+          Buffer.ExpandedTokens.begin() + A.BeginExpandedToken,
+          Buffer.ExpandedTokens.begin() + A.EndExpandedToken);
+      ASSERT_THAT(ActualExpanded,
+                  Pointwise(IsSameToken(), DropEOF(tokenize(E.To))))
+          << "\ttokens after macro replacements do not match, expected "
+          << E.To;
+    }
+  }
+
+  // Specialized versions of matchers that rely on SourceManager.
+  Matcher<syntax::Token> IsIdent(std::string Text) const {
+    return ::IsIdent(Text, SourceMgr.get());
+  }
+  Matcher<syntax::Token> HasText(std::string Text) const {
+    return ::HasText(Text, SourceMgr.get());
+  }
+  /// Converts the offsets in \p R to locations relative to the start of the
+  /// main file and matches tokens spanning exactly that range.
+  Matcher<syntax::Token> RangeIs(llvm::Annotations::Range R) const {
+    std::pair<SourceLocation, SourceLocation> Ls;
+    Ls.first = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID())
+                   .getLocWithOffset(R.Begin);
+    Ls.second = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID())
+                    .getLocWithOffset(R.End);
+    return ::RangeIs(Ls);
+  }
+
+  Matcher<std::tuple<const syntax::Token &, const syntax::Token &>>
+  IsSameToken() const {
+    return ::IsSameToken(SourceMgr.get());
+  }
+
+  /// Adds a file with the given contents to the in-memory file system and
+  /// reports a test failure if it could not be added.
+  void addFile(llvm::StringRef Path, llvm::StringRef Contents) {
+    if (!FS->addFile(Path, time_t(),
+                     llvm::MemoryBuffer::getMemBufferCopy(Contents))) {
+      ADD_FAILURE() << "could not add a file to VFS: " << Path;
+    }
+  }
+
+  // Data fields.
+  // Each initializer below uses the fields declared before it, so the
+  // declaration order matters.
+  llvm::IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
+      new DiagnosticsEngine(new DiagnosticIDs, new DiagnosticOptions);
+  IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS =
+      new llvm::vfs::InMemoryFileSystem;
+  llvm::IntrusiveRefCntPtr<FileManager> FileMgr =
+      new FileManager(FileSystemOptions(), FS);
+  llvm::IntrusiveRefCntPtr<SourceManager> SourceMgr =
+      new SourceManager(*Diags, *FileMgr);
+  /// Contains last result of calling recordTokens().
+  TokenBuffer Buffer = TokenBuffer(*SourceMgr);
+  /// Contains options from last run of recordTokens().
+  LangOptions LangOpts;
+};
+
+namespace {
+// Raw-mode tokenize() yields keyword kinds and ends the stream with 'eof'.
+TEST_F(TokensTest, RawMode) {
+  EXPECT_THAT(tokenize("int main() {}"),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // Comments are ignored for now.
+  EXPECT_THAT(tokenize("/* foo */int a; // more comments"),
+              ElementsAre(Kind(tok::kw_int), IsIdent("a"), Kind(tok::semi),
+                          Kind(tok::eof)));
+}
+
+// Tokens recorded from a frontend run have the expected kinds and ranges.
+TEST_F(TokensTest, Basic) {
+  recordTokens("int main() {}");
+  EXPECT_THAT(Buffer.expandedTokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // All kinds of whitespace are ignored.
+  recordTokens("\t\n int\t\n main\t\n (\t\n )\t\n{\t\n }\t\n");
+  EXPECT_THAT(Buffer.expandedTokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+
+  llvm::Annotations Code(R"cpp(
+    $r1[[int]] $r2[[a]] $r3[[=]] $r4[["foo bar baz"]] $r5[[;]]
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_THAT(
+      Buffer.expandedTokens(),
+      ElementsAre(AllOf(Kind(tok::kw_int), RangeIs(Code.range("r1"))),
+                  AllOf(Kind(tok::identifier), RangeIs(Code.range("r2"))),
+                  AllOf(Kind(tok::equal), RangeIs(Code.range("r3"))),
+                  AllOf(Kind(tok::string_literal), RangeIs(Code.range("r4"))),
+                  AllOf(Kind(tok::semi), RangeIs(Code.range("r5"))),
+                  Kind(tok::eof)));
+}
+
+// Directives contribute raw tokens only; no expanded tokens and no mappings.
+TEST_F(TokensTest, MacroDirectives) {
+  // Macro directives are not stored anywhere at the moment.
+  llvm::StringLiteral Code = R"cpp(
+    #define FOO a
+    #include "unresolved_file.h"
+    #undef FOO
+    #ifdef X
+    #else
+    #endif
+    #ifndef Y
+    #endif
+    #if 1
+    #elif 2
+    #else
+    #endif
+    #pragma once
+    #pragma something lalala
+
+    int a;
+  )cpp";
+  recordTokens(Code);
+
+  expectTokens("int a;");
+  expectMacroInvocations({});
+
+  expectSameTokens(Buffer.rawTokens(SourceMgr->getMainFileID()),
+                   tokenize(Code));
+}
+
+// Each top-level macro invocation yields exactly one raw-to-expanded mapping.
+TEST_F(TokensTest, MacroReplacements) {
+  // A simple object-like macro.
+  llvm::Annotations Code(R"cpp(
+    #define INT int const
+    [[INT]] a;
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int const a;");
+  expectMacroInvocations({{"INT", "int const", Code.range()}});
+
+  // A simple function-like macro.
+  Code = llvm::Annotations(R"cpp(
+    #define INT(a) const int
+    [[INT(10+10)]] a;
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("const int a;");
+  expectMacroInvocations({{"INT(10+10)", "const int", Code.range()}});
+
+  // Recursive macro replacements.
+  Code = llvm::Annotations(R"cpp(
+    #define ID(X) X
+    #define INT int const
+    [[ID(ID(INT))]] a;
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int const a;");
+  expectMacroInvocations({{"ID(ID(INT))", "int const", Code.range()}});
+
+  // A little more complicated recursive macro replacements.
+  Code = llvm::Annotations(R"cpp(
+    #define ADD(X, Y) X+Y
+    #define MULT(X, Y) X*Y
+
+    int a = [[ADD(MULT(1,2), MULT(3,ADD(4,5)))]];
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int a = 1*2+3*4+5;");
+  expectMacroInvocations(
+      {{"ADD(MULT(1,2), MULT(3,ADD(4,5)))", "1*2+3*4+5", Code.range()}});
+
+  // Empty macro replacement.
+  Code = llvm::Annotations(R"cpp(
+    #define EMPTY
+    #define EMPTY_FUNC(X)
+    $m[[EMPTY]]
+    $f[[EMPTY_FUNC(1+2+3)]]
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("");
+  expectMacroInvocations({{"EMPTY", "", Code.range("m")},
+                          {"EMPTY_FUNC(1+2+3)", "", Code.range("f")}});
+
+  // File ends with a macro replacement.
+  Code = llvm::Annotations(R"cpp(
+    #define FOO 10+10;
+    int a = [[FOO]])cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int a = 10+10;");
+  expectMacroInvocations({{"FOO", "10+10;", Code.range()}});
+}
+
+// Tokens that are not spelled contiguously in the source file.
+TEST_F(TokensTest, SpecialTokens) {
+  // Tokens coming from concatenations.
+  recordTokens(R"cpp(
+    #define CONCAT(a, b) a ## b
+    int a = CONCAT(1, 2);
+  )cpp");
+  expectTokens("int a = 12;");
+  // Multi-line tokens with slashes at the end.
+  recordTokens("i\\\nn\\\nt");
+  EXPECT_THAT(Buffer.expandedTokens(),
+              ElementsAre(AllOf(Kind(tok::kw_int), HasText("i\\\nn\\\nt")),
+                          Kind(tok::eof)));
+  // FIXME: test tokens with digraphs and UCN identifiers.
+}
+
+// '>>' is recorded as one token both as a shift and when closing templates.
+TEST_F(TokensTest, LateBoundTokens) {
+  // The parser eventually breaks the first '>>' into two tokens ('>' and '>'),
+  // but we choose to record them as a single token (for now).
+  llvm::Annotations Code(R"cpp(
+    template <class T>
+    struct foo { int a; };
+    int bar = foo<foo<int$br[[>>]]().a;
+    int baz = 10 $op[[>>]] 2;
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_THAT(std::vector<syntax::Token>(Buffer.expandedTokens()),
+              AllOf(Contains(AllOf(Kind(tok::greatergreater),
+                                   RangeIs(Code.range("br")))),
+                    Contains(AllOf(Kind(tok::greatergreater),
+                                   RangeIs(Code.range("op"))))));
+}
+
+// Tokens the parser revisits must be recorded once and in source order.
+TEST_F(TokensTest, DelayedParsing) {
+  llvm::StringLiteral Code = R"cpp(
+    struct Foo {
+      int method() {
+        // Parser will visit method bodies and initializers multiple times, but
+        // TokenBuffer should only record the first walk over the tokens;
+        return 100;
+      }
+      int a = 10;
+      int b = 20;
+
+      struct Subclass {
+        void foo() {
+          Foo().method();
+        }
+      };
+    };
+  )cpp";
+  recordTokens(Code);
+  // Checks that lexing in raw mode produces the same results, hence we're not
+  // recording any tokens twice and the order is the same.
+  expectTokens(Code);
+}
+
+// findOffsetsByExpanded() maps expanded token ranges back to file offsets and
+// returns None for ranges that cover a macro expansion only partially.
+TEST_F(TokensTest, Offsets) {
+  llvm::Annotations Code("");
+  /// Finds a token with the specified text.
+  /// Aborts the test if there is no such token or if it is not unique.
+  auto Find = [this](llvm::StringRef Text) {
+    llvm::ArrayRef<syntax::Token> Tokens = Buffer.expandedTokens();
+    auto TokenMatches = [=](const syntax::Token &T) {
+      return T.text(*SourceMgr) == Text;
+    };
+    auto It = llvm::find_if(Tokens, TokenMatches);
+    if (It == Tokens.end()) {
+      ADD_FAILURE() << "could not find the token for " << Text;
+      std::abort();
+    }
+    if (std::find_if(std::next(It), Tokens.end(), TokenMatches) !=
+        Tokens.end()) {
+      ADD_FAILURE() << "token is not unique: " << Text;
+      std::abort();
+    };
+    return It;
+  };
+  /// Builds the FileRange in the main file for the annotation called Name.
+  auto Range = [&](llvm::StringRef Name) {
+    auto R = Code.range(Name);
+    syntax::FileRange FR;
+    FR.File = SourceMgr->getMainFileID();
+    FR.Begin = R.Begin;
+    FR.End = R.End;
+    return FR;
+  };
+
+  // No macros: expanded ranges map directly to the written text.
+  Code = llvm::Annotations(R"cpp(
+    $all[[$first[[a1 a2 a3]] FIRST $second[[b1 b2]] LAST]]
+  )cpp");
+
+  recordTokens(Code.code());
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), std::next(Find("LAST")))),
+            Range("all"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), Find("FIRST"))),
+            Range("first"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("b1"), Find("LAST"))),
+            Range("second"));
+
+  // Object-like macros: whole expansions map to the macro name.
+  Code = llvm::Annotations(R"cpp(
+    #define A a1 a2 a3
+    #define B b1 b2
+
+    $all[[$first[[A]] FIRST $second[[B]] LAST]]
+  )cpp");
+  recordTokens(Code.code());
+
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), std::next(Find("LAST")))),
+            Range("all"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), Find("FIRST"))),
+            Range("first"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("b1"), Find("LAST"))),
+            Range("second"));
+  // Ranges not fully covering macro invocations should fail.
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a1"), Find("a3"))),
+      llvm::None);
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("b2"), Find("LAST"))),
+            llvm::None);
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a2"), Find("b2"))),
+      llvm::None);
+
+  // Function-like and nested macros.
+  Code = llvm::Annotations(R"cpp(
+    #define ID(x) x
+    #define B b1 b2
+
+    $both[[$first[[ID(ID(ID(a1) a2 a3))]] FIRST $second[[ID(B)]]]] LAST
+  )cpp");
+  recordTokens(Code.code());
+
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), Find("FIRST"))),
+            Range("first"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("b1"), Find("LAST"))),
+            Range("second"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), Find("LAST"))),
+            Range("both"));
+
+  // Ranges crossing macro call boundaries.
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a1"), Find("b2"))),
+      llvm::None);
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a2"), Find("b2"))),
+      llvm::None);
+  // FIXME: next two examples should map to macro arguments, but currently they
+  // fail.
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a2"), Find("a3"))),
+      llvm::None);
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a1"), Find("a3"))),
+      llvm::None);
+}
+
+// Mappings are recorded per file, including files pulled in via #include.
+TEST_F(TokensTest, MultiFile) {
+  addFile("./foo.h", R"cpp(
+    #define ADD(X, Y) X+Y
+    int a = 100;
+    #include "bar.h"
+  )cpp");
+  addFile("./bar.h", R"cpp(
+    int b = ADD(1, 2);
+    #define MULT(X, Y) X*Y
+  )cpp");
+  recordTokens(R"cpp(
+    #include "foo.h"
+    int c = ADD(1, MULT(2,3));
+  )cpp");
+
+  expectTokens(R"cpp(
+    int a = 100;
+    int b = 1+2;
+    int c = 1+2*3;
+  )cpp");
+  expectMacroInvocations({{"ADD(1,MULT(2,3))", "1+2*3"}});
+  // NOTE(review): `{{}}` presumably means "no invocations", like the `{}` used
+  // in MacroDirectives; confirm and consider spelling it the same way.
+  expectMacroInvocations({{}}, findFile("./foo.h"));
+  expectMacroInvocations({{"ADD(1,2)", "1+2"}}, findFile("./bar.h"));
+}
+} // namespace
Index: clang/unittests/Tooling/Syntax/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  Support
+  )
+
+add_clang_unittest(TokensTest
+  TokensTest.cpp
+)
+
+target_link_libraries(TokensTest
+  PRIVATE
+  clangAST
+  clangBasic
+  clangFrontend
+  clangLex
+  clangSerialization
+  clangTooling
+  clangToolingSyntax
+  LLVMTestingSupport
+  )
Index: clang/unittests/Tooling/CMakeLists.txt
===================================================================
--- clang/unittests/Tooling/CMakeLists.txt
+++ clang/unittests/Tooling/CMakeLists.txt
@@ -69,3 +69,6 @@
   clangToolingInclusions
   clangToolingRefactor
   )
+
+
+add_subdirectory(Syntax)
Index: clang/lib/Tooling/Syntax/Tokens.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Tokens.cpp
@@ -0,0 +1,369 @@
+//===- Tokens.cpp - collect tokens from preprocessing ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "clang/Tooling/Syntax/Tokens.h" + +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.def" +#include "clang/Basic/TokenKinds.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <iterator> +#include <utility> +#include <vector> + +using namespace clang; +using namespace clang::syntax; + +syntax::Token::Token(const clang::Token &T) + : Token(T.getLocation(), T.getLength(), T.getKind()) { + assert(!T.isAnnotation()); +} + +llvm::StringRef syntax::Token::text(const SourceManager &SM) const { + bool Invalid = false; + const char *Start = SM.getCharacterData(location(), &Invalid); + assert(!Invalid); + return llvm::StringRef(Start, length()); +} + +std::string syntax::Token::str() const { + return llvm::formatv("Token({0}, length = {1})", tok::getTokenName(kind()), + length()); +} + +std::string syntax::Token::str(const SourceManager &SM) const { + return llvm::formatv("Token({0}, length = {1}, location = {2}, text = {3})", + tok::getTokenName(kind()), length(), + location().printToString(SM), text(SM)); +} + +llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, const Token &T) { + return OS << T.str(); +} + +llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, + const FileRange &R) { + return OS << llvm::formatv("FileRange(file = {0}, offsets = {1}-{2})", + R.File.getHashValue(), R.Begin, R.End); +} + +void 
TokenBuffer::dump(llvm::raw_ostream &OS) const { + OS << "expanded tokens:\n"; + for (unsigned I = 0; I < ExpandedTokens.size(); ++I) + OS << llvm::formatv(" {0}: {1}\n", I, ExpandedTokens[I].str(*SourceMgr)); + + std::vector<FileID> Keys; + for (auto F : Files) + Keys.push_back(F.first); + llvm::sort(Keys); + + for (FileID ID : Keys) { + const MarkedFile &File = Files.find(ID)->second; + + auto *Entry = SourceMgr->getFileEntryForID(ID); + OS << " file " << (Entry ? Entry->getName() : "<<virtual file>>") << "\n"; + OS << " raw tokens:\n"; + for (unsigned I = 0; I < File.RawTokens.size(); ++I) + OS << llvm::formatv(" {0}: {1}\n", I, + File.RawTokens[I].str(*SourceMgr)); + OS << " mappings:\n"; + for (auto &M : File.Mappings) + OS << " " << M.str() << "\n"; + } +} + +std::pair<const syntax::Token *, const TokenBuffer::Mapping *> +TokenBuffer::expandedToRaw(const syntax::Token *Expanded) const { + assert(Expanded); + assert(ExpandedTokens.data() <= Expanded && + Expanded < ExpandedTokens.data() + ExpandedTokens.size()); + + auto FileIt = Files.find( + SourceMgr->getFileID(SourceMgr->getExpansionLoc(Expanded->location()))); + assert(FileIt != Files.end() && "no file for an expanded token"); + + const MarkedFile &File = FileIt->second; + + unsigned ExpandedIndex = Expanded - ExpandedTokens.data(); + // Find the first mapping that produced tokens after \p Expanded. + auto It = llvm::upper_bound( + File.Mappings, ExpandedIndex, + [](unsigned L, const Mapping &R) { return L < R.BeginExpandedToken; }); + // Our token could only be produced by the previous mapping. + if (It == File.Mappings.begin()) { + // No mapping could produce this mapping, pick the corresponding raw token. + return {&File.RawTokens[ExpandedIndex - File.BeginExpanded], nullptr}; + } + --It; // It now points to last mapping that started before our token. + + // Check if the token is part of the mapping. 
+ if (ExpandedIndex < It->EndExpandedToken) + return {&File.RawTokens[It->BeginRawToken], /*Mapping*/ &*It}; + + // Not part of the mapping, use the index from previous mapping to compute the + // corresponding raw token. + return { + &File.RawTokens[It->EndRawToken + (ExpandedIndex - It->EndExpandedToken)], + /*Mapping*/ nullptr}; +} + +llvm::ArrayRef<syntax::Token> TokenBuffer::rawTokens(FileID FID) const { + auto It = Files.find(FID); + assert(It != Files.end()); + return It->second.RawTokens; +} + +std::string TokenBuffer::Mapping::str() const { + return llvm::formatv("raw tokens: [{0},{1}), expanded " + "tokens: [{2},{3})", + BeginRawToken, EndRawToken, BeginExpandedToken, + EndExpandedToken); +} + +llvm::Optional<FileRange> TokenBuffer::findOffsetsByExpanded( + llvm::ArrayRef<syntax::Token> Expanded) const { + auto Tokens = findRawByExpanded(Expanded); + if (!Tokens) + return llvm::None; + assert(!Tokens->empty()); + + FileRange R; + std::tie(R.File, R.Begin) = + SourceMgr->getDecomposedLoc(Tokens->front().location()); + R.End = SourceMgr->getFileOffset(Tokens->back().endLocation()); + return R; +} + +llvm::Optional<llvm::ArrayRef<syntax::Token>> +TokenBuffer::findRawByExpanded(llvm::ArrayRef<syntax::Token> Expanded) const { + // Mapping an empty range is not well-defined, bail out in that case. + if (Expanded.empty()) + return llvm::None; + + // FIXME: also allow changes uniquely mapping to macro arguments. + + const syntax::Token *BeginRawToken; + const Mapping *BeginMapping; + std::tie(BeginRawToken, BeginMapping) = expandedToRaw(&Expanded.front()); + + const syntax::Token *LastRawToken; + const Mapping *LastMapping; + std::tie(LastRawToken, LastMapping) = expandedToRaw(&Expanded.back()); + + FileID FID = SourceMgr->getFileID(BeginRawToken->location()); + // FIXME: Handle multi-file changes by trying to map onto a common root. 
+ if (FID != SourceMgr->getFileID(LastRawToken->location())) + return llvm::None; + + const MarkedFile &File = Files.find(FID)->second; + + // Do not allow changes that cross macro expansion boundaries. + unsigned BeginExpanded = Expanded.begin() - ExpandedTokens.data(); + unsigned EndExpanded = Expanded.end() - ExpandedTokens.data(); + if (BeginMapping && BeginMapping->BeginExpandedToken < BeginExpanded) + return llvm::None; + if (LastMapping && EndExpanded < LastMapping->EndExpandedToken) + return llvm::None; + // All is good, return the result. + return llvm::makeArrayRef( + BeginMapping ? File.RawTokens.data() + BeginMapping->BeginRawToken + : BeginRawToken, + LastMapping ? File.RawTokens.data() + LastMapping->EndRawToken + : LastRawToken + 1); +} + +std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM, + const LangOptions &LO) { + std::vector<syntax::Token> Tokens; + IdentifierTable Identifiers(LO); + auto AddToken = [&](clang::Token T) { + // Fill the proper token kind for keywords, etc. + if (T.getKind() == tok::raw_identifier && !T.needsCleaning() && + !T.hasUCN()) { // FIXME: support needsCleaning and hasUCN cases. + clang::IdentifierInfo &II = Identifiers.get(T.getRawIdentifier()); + T.setIdentifierInfo(&II); + T.setKind(II.getTokenID()); + } + Tokens.push_back(syntax::Token(T)); + }; + + Lexer L(FID, SM.getBuffer(FID), SM, LO); + + clang::Token T; + while (!L.LexFromRawLexer(T)) + AddToken(T); + AddToken(T); + + return Tokens; +} + +/// Fills in the TokenBuffer by tracing the run of a preprocessor. The +/// implementation tracks the tokens, macro expansions and directives coming +/// from the preprocessor and: +/// - for each token, figures out if it is a part of an expanded token stream, +/// raw token stream or both. Stores the tokens appropriately. +/// - records mappings from the raw to expanded token ranges, e.g. for macro +/// expansions. 
+class TokenCollector::Callbacks : public PPCallbacks { +public: + Callbacks(const SourceManager &SM, TokenBuffer &Result) + : Result(Result), SM(SM) {} + + void FileChanged(SourceLocation Loc, FileChangeReason Reason, + SrcMgr::CharacteristicKind FileType, + FileID PrevFID) override { + assert(Loc.isFileID()); + File = &Result.Files.try_emplace(SM.getFileID(Loc)).first->second; + flushMacroExpansion(); + } + + void tokenLexed(const clang::Token &T, TokenSource S) { + if (S == TokenSource::Precached) + return; // the cached tokens are reported multiple times, we have already + // recorded these. + + auto L = T.getLocation(); + flushCurrentExpansion(L); + + if (ExpansionStart.isValid() && SM.getExpansionLoc(L) != ExpansionStart) { + // The token comes from intermediate replacements while processing macro + // arguments. These are not part of the expanded token and we only record + // the top-level macro expansions, so skip this token. + return; + } + + // 'eod' is a control token that we don't capture. + if (T.getKind() == tok::eod) + return; + + DEBUG_WITH_TYPE("collect-tokens", { + llvm::dbgs() << "$[token] " << syntax::Token(T).str(SM) << "\n"; + }); + + // Depending on where the token comes from, put it into an expanded token + // stream, a raw token stream, or both. 
+ switch (S) { + case TokenSource::File: + assert(T.getLocation().isFileID()); + Result.ExpandedTokens.push_back(syntax::Token(T)); + File->RawTokens.push_back(syntax::Token(T)); + break; + case clang::TokenSource::MacroExpansion: + assert(T.getLocation().isMacroID()); + Result.ExpandedTokens.push_back(syntax::Token(T)); + break; + case clang::TokenSource::MacroNameOrArg: + case TokenSource::MacroDirective: + case TokenSource::SkippedPPBranch: + assert(T.getLocation().isFileID()); + File->RawTokens.push_back(syntax::Token(T)); + break; + case TokenSource::Precached: + llvm_unreachable("cached tokens should be handled before"); + case TokenSource::AfterModuleImport: + llvm_unreachable("not implemented yet"); + } + } + + void MacroExpands(const clang::Token &MacroNameTok, const MacroDefinition &MD, + SourceRange Range, const MacroArgs *Args) override { + auto MacroNameLoc = MacroNameTok.getLocation(); + flushCurrentExpansion(MacroNameLoc); + + // We do not record recursive invocations. + if (isMacroExpanding()) + return; + + // Find the first raw token of the macro invocation, i.e. the name of the + // macro. + auto InvocationStart = llvm::find_if( + llvm::reverse(File->RawTokens), + [&](const syntax::Token &T) { return T.location() == MacroNameLoc; }); + assert(InvocationStart != File->RawTokens.rend() && + "macro name must be recorded."); + + // Add a raw-to-expanded mapping for this macro invocation. + TokenBuffer::Mapping M; + M.BeginRawToken = + std::prev(InvocationStart.base()) - File->RawTokens.begin(); + M.EndRawToken = File->RawTokens.size(); + + M.BeginExpandedToken = Result.ExpandedTokens.size(); + // MI.EndExpandedToken is filled by flushCurrentExpansion() when macro + // expansion finishes. + + File->Mappings.push_back(M); + + // We have to record where invocation ends in order to track it properly. 
+ std::tie(MacroInvocationFile, ExpansionEndOffset) = + SM.getDecomposedLoc(Range.getEnd()); + this->ExpansionStart = Range.getBegin(); + } + +private: + bool isMacroExpanding() const { return MacroInvocationFile.isValid(); } + + void flushMacroExpansion() { + if (!MacroInvocationFile.isValid()) + return; + assert(!File->Mappings.empty()); + assert(File->Mappings.back().EndExpandedToken == 0); + File->Mappings.back().EndExpandedToken = Result.ExpandedTokens.size(); + + MacroInvocationFile = FileID(); + ExpansionStart = SourceLocation(); + ExpansionEndOffset = 0; + } + + void flushCurrentExpansion(SourceLocation L) { + assert(L.isValid()); + if (!MacroInvocationFile.isValid()) + return; + FileID File; + unsigned Offset; + std::tie(File, Offset) = SM.getDecomposedLoc(L); + // Note that we always get a token inside the same file after macro + // expansion finishes (eof would be the last token) + if (File != MacroInvocationFile || Offset <= ExpansionEndOffset) + return; + // Check we are not inside the current macro arguments. + flushMacroExpansion(); + } + + TokenBuffer::MarkedFile *File = nullptr; + /// When valid, the file of the last active top-level macro invocation. 
+ FileID MacroInvocationFile; + SourceLocation ExpansionStart; + unsigned ExpansionEndOffset = 0; + TokenBuffer &Result; + const SourceManager &SM; +}; + +TokenCollector::TokenCollector(Preprocessor &PP) + : Tokens(PP.getSourceManager()) { + auto CBOwner = llvm::make_unique<Callbacks>(PP.getSourceManager(), Tokens); + auto *CB = CBOwner.get(); + + PP.addPPCallbacks(std::move(CBOwner)); + PP.setTokenWatcher( + [CB](const clang::Token &T, TokenSource S) { CB->tokenLexed(T, S); }); +} + +TokenBuffer TokenCollector::consume() && { return std::move(Tokens); } Index: clang/lib/Tooling/Syntax/CMakeLists.txt =================================================================== --- /dev/null +++ clang/lib/Tooling/Syntax/CMakeLists.txt @@ -0,0 +1,10 @@ +set(LLVM_LINK_COMPONENTS Support) + +add_clang_library(clangToolingSyntax + Tokens.cpp + + LINK_LIBS + clangBasic + clangFrontend + clangLex + ) Index: clang/lib/Tooling/CMakeLists.txt =================================================================== --- clang/lib/Tooling/CMakeLists.txt +++ clang/lib/Tooling/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(Inclusions) add_subdirectory(Refactoring) add_subdirectory(ASTDiff) +add_subdirectory(Syntax) add_clang_library(clangTooling AllTUsExecution.cpp Index: clang/include/clang/Tooling/Syntax/Tokens.h =================================================================== --- /dev/null +++ clang/include/clang/Tooling/Syntax/Tokens.h @@ -0,0 +1,267 @@ +//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Record tokens that a preprocessor emits and define operations to map between +// the tokens written in a file and tokens produced by the preprocessor. 
+// +// When running the compiler, there are two token streams we are interested in: +// - "raw" tokens directly correspond to a substring written in some source +// file. +// - "expanded" tokens represent the result of preprocessing, the parser +// consumes this token stream to produce the AST. +// +// Expanded tokens correspond directly to locations found in the AST, allowing +// one to find subranges of the token stream covered by various AST nodes. Raw +// tokens correspond directly to the source code written by the user. +// +// To allow composing these two use-cases, we also define operations that map +// between expanded and raw tokens that produced them (macro calls, directives, +// etc). +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H +#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H + +#include "clang/Basic/FileManager.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Lex/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" +#include <cstdint> +#include <string> +#include <tuple> +#include <utility> +#include <vector> + +namespace clang { +class Preprocessor; + +namespace syntax { +class TokenBuffer; + +/// A token coming directly from a file or from a macro invocation. Has just +/// enough information to locate the token in the source code. +/// Used to represent both expanded and raw tokens. +class Token { +public: + Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind) + : Location(Location), Length(Length), Kind(Kind) {} + /// EXPECTS: clang::Token is not an annotation token. 
+ explicit Token(const clang::Token &T); + + tok::TokenKind kind() const { return Kind; } + SourceLocation location() const { return Location; } + /// The location at offset length() from location(). + SourceLocation endLocation() const { + return Location.getLocWithOffset(Length); + } + unsigned length() const { return Length; } + + /// Get the substring covered by the token. Note that it will include all + /// digraphs, newline continuations, etc. E.g. tokens for 'int' and + /// in\ + /// t + /// both have the same kind tok::kw_int, but results of text() are different. + llvm::StringRef text(const SourceManager &SM) const; + + /// For debugging purposes. More verbose than the other overload, but requires + /// a source manager. + std::string str(const SourceManager &SM) const; + /// For debugging purposes. + std::string str() const; + +private: + SourceLocation Location; + unsigned Length; + tok::TokenKind Kind; +}; +/// For debugging purposes. Equivalent to a call to Token::str(). +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T); + +/// A half-open range inside a particular file, the start offset is included and +/// the end offset is excluded from the range. +struct FileRange { + FileID File; + /// Start offset (inclusive) in a corresponding file. + unsigned Begin = 0; + /// End offset (exclusive) in a corresponding file. + unsigned End = 0; +}; +/// Two ranges are equal when their file, begin and end offsets are all equal. +inline bool operator==(const FileRange &L, const FileRange &R) { + return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End); +} +inline bool operator!=(const FileRange &L, const FileRange &R) { + return !(L == R); +} +/// For debugging purposes. +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R); + +/// A list of tokens obtained by preprocessing a text buffer and operations to +/// map between the expanded and raw tokens, i.e. TokenBuffer has information +/// about two token streams: +/// 1. Expanded tokens: tokens produced by the preprocessor after all macro +/// replacements, +/// 2. 
Raw tokens: corresponding directly to the source code of a file before +/// any macro replacements occurred. +/// Here's an example to illustrate a difference between those two: +/// #define FOO 10 +/// int a = FOO; +/// +/// Raw tokens are {'#', 'define', 'FOO', '10', 'int', 'a', '=', 'FOO', ';'}. +/// Expanded tokens are {'int', 'a', '=', '10', ';'}. +/// +/// The full list of expanded tokens can be obtained with expandedTokens(). Raw +/// tokens for each of the files can be obtained via rawTokens(FileID). +/// +/// To map between the expanded and raw token streams, see findRawByExpanded(). +/// +/// To build a token buffer use the TokenCollector class. You can also compute +/// the raw tokens of a file using the tokenize() helper. +class TokenBuffer { +public: + TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {} + /// All tokens produced by the preprocessor after all macro replacements, + /// directives, etc. Source locations found in the clang AST will always + /// point to one of these tokens. + /// FIXME: the notable exception is '>>' being split into two '>'. figure out + /// how to deal with it. + llvm::ArrayRef<syntax::Token> expandedTokens() const { + return ExpandedTokens; + } + + /// Attempt to find the subrange of raw tokens that produced the corresponding + /// \p Expanded tokens. Will fail if the raw tokens cannot be determined + /// unambiguously. E.g. for the following example: + /// + /// #define FIRST f1 f2 f3 + /// #define SECOND s1 s2 s3 + /// + /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c + /// + /// the results would be: + /// expanded => raw + /// ------------------------ + /// a => a + /// s1 s2 s3 => SECOND + /// a f1 f2 f3 => a FIRST + /// a f1 => can't map + /// s1 s2 => can't map + /// + /// If \p Expanded is empty, the returned value is llvm::None. + /// Complexity is logarithmic. 
+ llvm::Optional<llvm::ArrayRef<syntax::Token>> + findRawByExpanded(llvm::ArrayRef<syntax::Token> Expanded) const; + + /// Obtain the text offsets corresponding to the tokens returned by + /// findRawByExpanded. + llvm::Optional<FileRange> + findOffsetsByExpanded(llvm::ArrayRef<syntax::Token> Expanded) const; + + /// Lexed tokens of a file before preprocessing. E.g. for the following input + /// #define DECL(name) int name = 10 + /// DECL(a); + /// rawTokens() returns {"#", "define", "DECL", "(", "name", ")"}. + /// FIXME: we do not yet store tokens of directives, like #include, #define, + /// #pragma, etc. + llvm::ArrayRef<syntax::Token> rawTokens(FileID FID) const; + + /// For debugging purposes. + void dump(llvm::raw_ostream &OS) const; + +private: + /// Describes a mapping between a continuous subrange of raw tokens and the + /// expanded tokens. Represents macro expansions, preprocessor directives, + /// conditionally disabled pp regions, etc. + /// #define FOO 1+2 + /// #define BAR(a) a + 1 + /// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}. + /// BAR(1) // invocation #2, tokens = {'1', '+', '1'}, + /// macroTokens = {'BAR', '(', '1', ')'}. + struct Mapping { + // Positions in the corresponding raw token stream. The corresponding range + // is never empty. + unsigned BeginRawToken = 0; + unsigned EndRawToken = 0; + // Positions in the expanded token stream. The corresponding range can be + // empty. + unsigned BeginExpandedToken = 0; + unsigned EndExpandedToken = 0; + + /// For debugging purposes. + std::string str() const; + }; + /// Raw tokens of the file with information about the subranges. + struct MarkedFile { + /// Lexed, but not preprocessed, tokens of the file. These map directly to + /// text in the corresponding files and include tokens of all preprocessor + /// directives. + /// FIXME: raw tokens don't change across FileID that map to the same + /// FileEntry. We could consider deduplicating them to save memory. 
+ std::vector<syntax::Token> RawTokens; + /// A sorted list to convert between the raw and expanded token streams. + std::vector<Mapping> Mappings; + /// The first expanded token produced for this FileID. + unsigned BeginExpanded = 0; + unsigned EndExpanded = 0; + }; + + friend class TokenCollector; + // Testing code has access to internal mapping. + friend class TokensTest; + + /// Maps a single expanded token to its raw counterpart or a mapping that + /// produced it. + std::pair<const syntax::Token *, const Mapping *> + expandedToRaw(const syntax::Token *Expanded) const; + + /// Token stream produced after preprocessing, conceptually this captures the + /// same stream as 'clang -E' (excluding the preprocessor directives like + /// #file, etc.). + std::vector<syntax::Token> ExpandedTokens; + llvm::DenseMap<FileID, MarkedFile> Files; + // The value is never null, pointer instead of reference to avoid disabling + // implicit assignment operator. + const SourceManager *SourceMgr; +}; + +/// Lex the text buffer, corresponding to \p FID, in raw mode and record the +/// resulting tokens. Does minimal post-processing on raw identifiers, setting +/// the appropriate token kind (instead of the raw_identifier reported by lexer +/// in raw mode). This is a very low-level function, most users should prefer to +/// use TokenCollector. Lexing in raw mode produces wildly different results +/// from what one might expect when running a C++ frontend, e.g. preprocessor +/// does not run at all. +std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM, + const LangOptions &LO); + +/// Collects tokens for the main file while running the frontend action. An +/// instance of this object should be created on +/// FrontendAction::BeginSourceFile() and the results should be consumed after +/// FrontendAction::Execute() finishes. +class TokenCollector { +public: + /// Adds the hooks to collect the tokens. Should be called before the + /// preprocessing starts, i.e. 
as a part of BeginSourceFile() or + /// CreateASTConsumer(). + TokenCollector(Preprocessor &P); + + /// Finalizes token collection. Should be called after preprocessing is + /// finished, i.e. after running Execute(). + LLVM_NODISCARD TokenBuffer consume() &&; + +private: + class Callbacks; + TokenBuffer Tokens; +}; + +} // namespace syntax +} // namespace clang + +#endif
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits