ilya-biryukov updated this revision to Diff 198868. ilya-biryukov marked 2 inline comments as done. ilya-biryukov added a comment.
- Use bsearch instead of upper_bound Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D59887/new/ https://reviews.llvm.org/D59887 Files: clang/include/clang/Tooling/Syntax/Tokens.h clang/lib/Tooling/CMakeLists.txt clang/lib/Tooling/Syntax/CMakeLists.txt clang/lib/Tooling/Syntax/Tokens.cpp clang/unittests/Tooling/CMakeLists.txt clang/unittests/Tooling/Syntax/CMakeLists.txt clang/unittests/Tooling/Syntax/TokensTest.cpp
Index: clang/unittests/Tooling/Syntax/TokensTest.cpp =================================================================== --- /dev/null +++ clang/unittests/Tooling/Syntax/TokensTest.cpp @@ -0,0 +1,622 @@ +//===- TokensTest.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Tokens.h" +#include "clang/AST/ASTConsumer.h" +#include "clang/AST/Expr.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/DiagnosticIDs.h" +#include "clang/Basic/DiagnosticOptions.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/FileSystemOptions.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.def" +#include "clang/Basic/TokenKinds.h" +#include "clang/Frontend/CompilerInstance.h" +#include "clang/Frontend/FrontendAction.h" +#include "clang/Frontend/Utils.h" +#include "clang/Lex/Lexer.h" +#include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/Token.h" +#include "clang/Tooling/Tooling.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/IntrusiveRefCntPtr.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Testing/Support/Annotations.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gmock/gmock-matchers.h" +#include "gmock/gmock-more-matchers.h" +#include <cassert> +#include 
<cstdlib> +#include <gmock/gmock.h> +#include <gtest/gtest.h> +#include <memory> +#include <ostream> +#include <string> + +using namespace clang; +using namespace clang::syntax; + +using llvm::ValueIs; +using ::testing::AllOf; +using ::testing::Contains; +using ::testing::ElementsAre; +using ::testing::Matcher; +using ::testing::Not; +using ::testing::StartsWith; + +namespace { +// Checks the passed ArrayRef<T> has the same begin() and end() iterators as the +// argument. +MATCHER_P(SameRange, A, "") { + return A.begin() == arg.begin() && A.end() == arg.end(); +} +// Matchers for syntax::Token. +MATCHER_P(Kind, K, "") { return arg.kind() == K; } +MATCHER_P2(HasText, Text, SourceMgr, "") { + return arg.text(*SourceMgr) == Text; +} +/// Checks the start and end location of a token are equal to SourceRng. +MATCHER_P(RangeIs, SourceRng, "") { + return arg.location() == SourceRng.first && + arg.endLocation() == SourceRng.second; +} + +class TokenCollectorTest : public ::testing::Test { +public: + /// Run the clang frontend, collect the preprocessed tokens from the frontend + /// invocation and store them in this->Buffer. + /// This also clears SourceManager before running the compiler. 
+  void recordTokens(llvm::StringRef Code) {
+    // Frontend action that attaches a TokenCollector to the compiler's
+    // preprocessor when a source file begins and moves the collected tokens
+    // into the TokenBuffer passed to the constructor when the file ends.
+    class RecordTokens : public ASTFrontendAction {
+    public:
+      explicit RecordTokens(TokenBuffer &Result) : Result(Result) {}
+
+      bool BeginSourceFileAction(CompilerInstance &CI) override {
+        assert(!Collector && "expected only a single call to BeginSourceFile");
+        Collector.emplace(CI.getPreprocessor());
+        return true;
+      }
+      void EndSourceFileAction() override {
+        assert(Collector && "BeginSourceFileAction was never called");
+        Result = std::move(*Collector).consume();
+      }
+
+      // A plain ASTConsumer is returned; neither argument is used.
+      std::unique_ptr<ASTConsumer>
+      CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override {
+        return llvm::make_unique<ASTConsumer>();
+      }
+
+    private:
+      TokenBuffer &Result;
+      llvm::Optional<TokenCollector> Collector;
+    };
+
+    constexpr const char *FileName = "./input.cpp";
+    // Register the main file in the in-memory file system with empty contents;
+    // the text of \p Code is supplied through the remapped-file entry below.
+    // The result of addFile() is not checked here.
+    FS->addFile(FileName, time_t(), llvm::MemoryBuffer::getMemBufferCopy(""));
+    // Prepare to run a compiler.
+    std::vector<const char *> Args = {"tok-test", "-std=c++03", "-fsyntax-only",
+                                      FileName};
+    auto CI = createInvocationFromCommandLine(Args, Diags, FS);
+    assert(CI);
+    // NOTE(review): presumably makes the compiler release its resources when
+    // the action finishes - confirm against FrontendOptions::DisableFree.
+    CI->getFrontendOpts().DisableFree = false;
+    // The buffer holding a copy of \p Code is handed over as a raw pointer.
+    // NOTE(review): presumably PreprocessorOptions takes ownership - confirm.
+    CI->getPreprocessorOpts().addRemappedFile(
+        FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release());
+    CompilerInstance Compiler;
+    Compiler.setInvocation(std::move(CI));
+    // Install an IgnoringDiagConsumer only when no client has been set yet.
+    if (!Diags->getClient())
+      Diags->setClient(new IgnoringDiagConsumer);
+    // The compiler reuses the fixture's DiagnosticsEngine, FileManager and
+    // SourceManager instead of creating its own.
+    Compiler.setDiagnostics(Diags.get());
+    Compiler.setFileManager(FileMgr.get());
+    Compiler.setSourceManager(SourceMgr.get());
+
+    // Start from a fresh buffer; Recorder overwrites it when the action ends.
+    this->Buffer = TokenBuffer(*SourceMgr);
+    RecordTokens Recorder(this->Buffer);
+    // On failure ASSERT_TRUE reports a fatal test failure and returns from
+    // recordTokens().
+    ASSERT_TRUE(Compiler.ExecuteAction(Recorder))
+        << "failed to run the frontend";
+  }
+
+  /// Record the tokens and return a test dump of the resulting buffer.
+  /// Equivalent to calling recordTokens(Code) followed by
+  /// Buffer.dumpForTests().
+  std::string collectAndDump(llvm::StringRef Code) {
+    recordTokens(Code);
+    return Buffer.dumpForTests();
+  }
+
+  // Adds a file to the test VFS.
+  void addFile(llvm::StringRef Path, llvm::StringRef Contents) {
+    // A copy of \p Contents is stored under \p Path in the in-memory file
+    // system; a rejected insertion is reported as a non-fatal test failure.
+    if (!FS->addFile(Path, time_t(),
+                     llvm::MemoryBuffer::getMemBufferCopy(Contents))) {
+      ADD_FAILURE() << "could not add a file to VFS: " << Path;
+    }
+  }
+
+  /// Add a new file, run syntax::tokenize() on it and return the results.
+  /// The file is created in SourceMgr from a copy of \p Text and is lexed with
+  /// default-constructed LangOptions.
+  std::vector<syntax::Token> tokenize(llvm::StringRef Text) {
+    // FIXME: pass proper LangOptions.
+    return syntax::tokenize(
+        SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(Text)),
+        *SourceMgr, LangOptions());
+  }
+
+  // Specialized versions of matchers that hide the SourceManager from clients.
+  Matcher<syntax::Token> HasText(std::string Text) const {
+    return ::HasText(Text, SourceMgr.get());
+  }
+  // The offsets in \p R are applied to the start of the main file.
+  Matcher<syntax::Token> RangeIs(llvm::Annotations::Range R) const {
+    std::pair<SourceLocation, SourceLocation> Ls;
+    Ls.first = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID())
+                   .getLocWithOffset(R.Begin);
+    Ls.second = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID())
+                    .getLocWithOffset(R.End);
+    return ::RangeIs(Ls);
+  }
+
+  /// Finds the first occurrence of \p Subrange inside \p Range in O(n * m).
+  /// Elements are compared with F(SubrangeElement, RangeElement).
+  /// Returns the matching slice of \p Range. If no complete match exists,
+  /// returns an empty range positioned at Range.end(). A candidate that would
+  /// run past the end of \p Range is never reported as a match.
+  template <class T, class U, class Eq>
+  llvm::ArrayRef<T> findSubrange(llvm::ArrayRef<U> Subrange,
+                                 llvm::ArrayRef<T> Range, Eq F) {
+    const auto NotFound = llvm::makeArrayRef(Range.end(), Range.end());
+    // A needle longer than the haystack cannot match; bailing out here also
+    // keeps the pointer arithmetic below inside \p Range.
+    if (Range.size() < Subrange.size())
+      return NotFound;
+    // Last position at which a full-length match can still start.
+    const auto LastBegin = Range.end() - Subrange.size();
+    for (auto Begin = Range.begin(); Begin <= LastBegin; ++Begin) {
+      auto It = Begin;
+      bool Matches = true;
+      for (auto ItSub = Subrange.begin(); ItSub != Subrange.end();
+           ++ItSub, ++It) {
+        if (!F(*ItSub, *It)) {
+          Matches = false;
+          break;
+        }
+      }
+      if (Matches)
+        return llvm::makeArrayRef(Begin, It);
+    }
+    return NotFound;
+  }
+
+  /// Finds a subrange in \p Tokens that match the tokens specified in \p Query.
+  /// The match should be unique. \p Query is a space-separated list of
+  /// tokens to search for.
+  llvm::ArrayRef<syntax::Token>
+  findTokenRange(llvm::StringRef Query, llvm::ArrayRef<syntax::Token> Tokens) {
+    // Break the query into individual token spellings; empty pieces produced
+    // by repeated spaces are dropped.
+    llvm::SmallVector<llvm::StringRef, 8> Pieces;
+    Query.split(Pieces, ' ', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+    if (Pieces.empty()) {
+      ADD_FAILURE() << "will not look for an empty list of tokens";
+      std::abort();
+    }
+    const llvm::ArrayRef<llvm::StringRef> Needle = llvm::makeArrayRef(Pieces);
+    // A query piece matches a token when it equals the token's source text.
+    auto SameText = [this](llvm::StringRef Spelling, const syntax::Token &Tok) {
+      return Tok.text(*SourceMgr) == Spelling;
+    };
+
+    // Locate the first occurrence; a result starting at Tokens.end() means
+    // nothing was found.
+    const llvm::ArrayRef<syntax::Token> Match =
+        findSubrange(Needle, Tokens, SameText);
+    if (Match.begin() == Tokens.end()) {
+      ADD_FAILURE() << "could not find the subrange for " << Query;
+      std::abort();
+    }
+
+    // Search the tokens that follow the match; a second hit means the query
+    // was ambiguous.
+    const llvm::ArrayRef<syntax::Token> Rest =
+        llvm::makeArrayRef(Match.end(), Tokens.end());
+    const llvm::ArrayRef<syntax::Token> Second =
+        findSubrange(Needle, Rest, SameText);
+    if (Second.begin() != Tokens.end()) {
+      ADD_FAILURE() << "match is not unique for " << Query;
+      std::abort();
+    }
+    return Match;
+  }
+
+  // Convenience wrappers around findTokenRange: one searches the expanded
+  // token stream, the other the spelled tokens of a file.
+  llvm::ArrayRef<syntax::Token> findExpanded(llvm::StringRef Query) {
+    const llvm::ArrayRef<syntax::Token> Expanded = Buffer.expandedTokens();
+    return findTokenRange(Query, Expanded);
+  }
+  llvm::ArrayRef<syntax::Token> findSpelled(llvm::StringRef Query,
+                                            FileID File = FileID()) {
+    // An invalid (default) FileID selects the main file.
+    const FileID Target = File.isValid() ? File : SourceMgr->getMainFileID();
+    return findTokenRange(Query, Buffer.spelledTokens(Target));
+  }
+
+  // Data fields. Declaration order matters: FileMgr is built from FS, and
+  // SourceMgr from Diags and FileMgr.
+  llvm::IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
+      new DiagnosticsEngine(new DiagnosticIDs, new DiagnosticOptions);
+  IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS =
+      new llvm::vfs::InMemoryFileSystem;
+  llvm::IntrusiveRefCntPtr<FileManager> FileMgr =
+      new FileManager(FileSystemOptions(), FS);
+  llvm::IntrusiveRefCntPtr<SourceManager> SourceMgr =
+      new SourceManager(*Diags, *FileMgr);
+  /// Contains last result of calling recordTokens().
+ TokenBuffer Buffer = TokenBuffer(*SourceMgr); +}; + +TEST_F(TokenCollectorTest, RawMode) { + EXPECT_THAT(tokenize("int main() {}"), + ElementsAre(Kind(tok::kw_int), + AllOf(HasText("main"), Kind(tok::identifier)), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace), Kind(tok::r_brace))); + // Comments are ignored for now. + EXPECT_THAT(tokenize("/* foo */int a; // more comments"), + ElementsAre(Kind(tok::kw_int), + AllOf(HasText("a"), Kind(tok::identifier)), + Kind(tok::semi))); +} + +TEST_F(TokenCollectorTest, Basic) { + std::pair</*Input*/ std::string, /*Expected*/ std::string> TestCases[] = { + {"int main() {}", + R"(expanded tokens: + int main ( ) { } +file './input.cpp' + spelled tokens: + int main ( ) { } + no mappings. +)"}, + // All kinds of whitespace are ignored. + {"\t\n int\t\n main\t\n (\t\n )\t\n{\t\n }\t\n", + R"(expanded tokens: + int main ( ) { } +file './input.cpp' + spelled tokens: + int main ( ) { } + no mappings. +)"}}; + for (auto &Test : TestCases) + EXPECT_EQ(collectAndDump(Test.first), Test.second); +} + +TEST_F(TokenCollectorTest, Locations) { + // Check locations of the tokens. + llvm::Annotations Code(R"cpp( + $r1[[int]] $r2[[a]] $r3[[=]] $r4[["foo bar baz"]] $r5[[;]] + )cpp"); + recordTokens(Code.code()); + // Check expanded tokens. + EXPECT_THAT( + Buffer.expandedTokens(), + ElementsAre(AllOf(Kind(tok::kw_int), RangeIs(Code.range("r1"))), + AllOf(Kind(tok::identifier), RangeIs(Code.range("r2"))), + AllOf(Kind(tok::equal), RangeIs(Code.range("r3"))), + AllOf(Kind(tok::string_literal), RangeIs(Code.range("r4"))), + AllOf(Kind(tok::semi), RangeIs(Code.range("r5"))), + Kind(tok::eof))); + // Check spelled tokens. 
+ EXPECT_THAT( + Buffer.spelledTokens(SourceMgr->getMainFileID()), + ElementsAre(AllOf(Kind(tok::kw_int), RangeIs(Code.range("r1"))), + AllOf(Kind(tok::identifier), RangeIs(Code.range("r2"))), + AllOf(Kind(tok::equal), RangeIs(Code.range("r3"))), + AllOf(Kind(tok::string_literal), RangeIs(Code.range("r4"))), + AllOf(Kind(tok::semi), RangeIs(Code.range("r5"))))); +} + +TEST_F(TokenCollectorTest, MacroDirectives) { + // Macro directives are not stored anywhere at the moment. + std::string Code = R"cpp( + #define FOO a + #include "unresolved_file.h" + #undef FOO + #ifdef X + #else + #endif + #ifndef Y + #endif + #if 1 + #elif 2 + #else + #endif + #pragma once + #pragma something lalala + + int a; + )cpp"; + std::string Expected = + "expanded tokens:\n" + " int a ;\n" + "file './input.cpp'\n" + " spelled tokens:\n" + " # define FOO a # include \"unresolved_file.h\" # undef FOO " + "# ifdef X # else # endif # ifndef Y # endif # if 1 # elif 2 # else " + "# endif # pragma once # pragma something lalala int a ;\n" + " mappings:\n" + " ['#'_0, 'int'_39) => ['int'_0, 'int'_0)\n"; + EXPECT_EQ(collectAndDump(Code), Expected); +} + +TEST_F(TokenCollectorTest, MacroReplacements) { + std::pair</*Input*/ std::string, /*Expected*/ std::string> TestCases[] = { + // A simple object-like macro. + {R"cpp( + #define INT int const + INT a; + )cpp", + R"(expanded tokens: + int const a ; +file './input.cpp' + spelled tokens: + # define INT int const INT a ; + mappings: + ['#'_0, 'INT'_5) => ['int'_0, 'int'_0) + ['INT'_5, 'a'_6) => ['int'_0, 'a'_2) +)"}, + // A simple function-like macro. + {R"cpp( + #define INT(a) const int + INT(10+10) a; + )cpp", + R"(expanded tokens: + const int a ; +file './input.cpp' + spelled tokens: + # define INT ( a ) const int INT ( 10 + 10 ) a ; + mappings: + ['#'_0, 'INT'_8) => ['const'_0, 'const'_0) + ['INT'_8, 'a'_14) => ['const'_0, 'a'_2) +)"}, + // Recursive macro replacements. 
+ {R"cpp( + #define ID(X) X + #define INT int const + ID(ID(INT)) a; + )cpp", + R"(expanded tokens: + int const a ; +file './input.cpp' + spelled tokens: + # define ID ( X ) X # define INT int const ID ( ID ( INT ) ) a ; + mappings: + ['#'_0, 'ID'_12) => ['int'_0, 'int'_0) + ['ID'_12, 'a'_19) => ['int'_0, 'a'_2) +)"}, + // A little more complicated recursive macro replacements. + {R"cpp( + #define ADD(X, Y) X+Y + #define MULT(X, Y) X*Y + + int a = ADD(MULT(1,2), MULT(3,ADD(4,5))); + )cpp", + "expanded tokens:\n" + " int a = 1 * 2 + 3 * 4 + 5 ;\n" + "file './input.cpp'\n" + " spelled tokens:\n" + " # define ADD ( X , Y ) X + Y # define MULT ( X , Y ) X * Y int " + "a = ADD ( MULT ( 1 , 2 ) , MULT ( 3 , ADD ( 4 , 5 ) ) ) ;\n" + " mappings:\n" + " ['#'_0, 'int'_22) => ['int'_0, 'int'_0)\n" + " ['ADD'_25, ';'_46) => ['1'_3, ';'_12)\n"}, + // Empty macro replacement. + {R"cpp( + #define EMPTY + #define EMPTY_FUNC(X) + EMPTY + EMPTY_FUNC(1+2+3) + )cpp", + R"(expanded tokens: + <empty> +file './input.cpp' + spelled tokens: + # define EMPTY # define EMPTY_FUNC ( X ) EMPTY EMPTY_FUNC ( 1 + 2 + 3 ) + mappings: + ['#'_0, '<eof>'_18) => ['<eof>'_0, '<eof>'_0) +)"}, + // File ends with a macro replacement. + {R"cpp( + #define FOO 10+10; + int a = FOO + )cpp", + R"(expanded tokens: + int a = 10 + 10 ; +file './input.cpp' + spelled tokens: + # define FOO 10 + 10 ; int a = FOO + mappings: + ['#'_0, 'int'_7) => ['int'_0, 'int'_0) + ['FOO'_10, '<eof>'_11) => ['10'_3, '<eof>'_7) +)"}}; + + for (auto &Test : TestCases) + EXPECT_EQ(Test.second, collectAndDump(Test.first)) + << collectAndDump(Test.first); +} + +TEST_F(TokenCollectorTest, SpecialTokens) { + // Tokens coming from concatenations. + recordTokens(R"cpp( + #define CONCAT(a, b) a ## b + int a = CONCAT(1, 2); + )cpp"); + EXPECT_THAT(std::vector<syntax::Token>(Buffer.expandedTokens()), + Contains(HasText("12"))); + // Multi-line tokens with slashes at the end. 
+ recordTokens("i\\\nn\\\nt"); + EXPECT_THAT(Buffer.expandedTokens(), + ElementsAre(AllOf(Kind(tok::kw_int), HasText("i\\\nn\\\nt")), + Kind(tok::eof))); + // FIXME: test tokens with digraphs and UCN identifiers. +} + +TEST_F(TokenCollectorTest, LateBoundTokens) { + // The parser eventually breaks the first '>>' into two tokens ('>' and '>'), + // but we choose to record them as a single token (for now). + llvm::Annotations Code(R"cpp( + template <class T> + struct foo { int a; }; + int bar = foo<foo<int$br[[>>]]().a; + int baz = 10 $op[[>>]] 2; + )cpp"); + recordTokens(Code.code()); + EXPECT_THAT(std::vector<syntax::Token>(Buffer.expandedTokens()), + AllOf(Contains(AllOf(Kind(tok::greatergreater), + RangeIs(Code.range("br")))), + Contains(AllOf(Kind(tok::greatergreater), + RangeIs(Code.range("op")))))); +} + +TEST_F(TokenCollectorTest, DelayedParsing) { + llvm::StringLiteral Code = R"cpp( + struct Foo { + int method() { + // Parser will visit method bodies and initializers multiple times, but + // TokenBuffer should only record the first walk over the tokens; + return 100; + } + int a = 10; + + struct Subclass { + void foo() { + Foo().method(); + } + }; + }; + )cpp"; + std::string ExpectedTokens = + "expanded tokens:\n" + " struct Foo { int method ( ) { return 100 ; } int a = 10 ; struct " + "Subclass { void foo ( ) { Foo ( ) . 
method ( ) ; } } ; } ;\n"; + EXPECT_THAT(collectAndDump(Code), StartsWith(ExpectedTokens)); +} + +TEST_F(TokenCollectorTest, MultiFile) { + addFile("./foo.h", R"cpp( + #define ADD(X, Y) X+Y + int a = 100; + #include "bar.h" + )cpp"); + addFile("./bar.h", R"cpp( + int b = ADD(1, 2); + #define MULT(X, Y) X*Y + )cpp"); + llvm::StringLiteral Code = R"cpp( + #include "foo.h" + int c = ADD(1, MULT(2,3)); + )cpp"; + + std::string Expected = R"(expanded tokens: + int a = 100 ; int b = 1 + 2 ; int c = 1 + 2 * 3 ; +file './input.cpp' + spelled tokens: + # include "foo.h" int c = ADD ( 1 , MULT ( 2 , 3 ) ) ; + mappings: + ['#'_0, 'int'_3) => ['int'_12, 'int'_12) + ['ADD'_6, ';'_17) => ['1'_15, ';'_20) +file './foo.h' + spelled tokens: + # define ADD ( X , Y ) X + Y int a = 100 ; # include "bar.h" + mappings: + ['#'_0, 'int'_11) => ['int'_0, 'int'_0) + ['#'_16, '<eof>'_19) => ['int'_5, 'int'_5) +file './bar.h' + spelled tokens: + int b = ADD ( 1 , 2 ) ; # define MULT ( X , Y ) X * Y + mappings: + ['ADD'_3, ';'_9) => ['1'_8, ';'_11) + ['#'_10, '<eof>'_21) => ['int'_12, 'int'_12) +)"; + + EXPECT_EQ(Expected, collectAndDump(Code)) + << "input: " << Code << "\nresults: " << collectAndDump(Code); +} + +class TokenBufferTest : public TokenCollectorTest {}; + +TEST_F(TokenBufferTest, SpelledByExpanded) { + recordTokens(R"cpp( + a1 a2 a3 b1 b2 + )cpp"); + + // Sanity check: expanded and spelled tokens are stored separately. + EXPECT_THAT(findExpanded("a1 a2"), Not(SameRange(findSpelled("a1 a2")))); + // Searching for subranges of expanded tokens should give the corresponding + // spelled ones. + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("a1 a2 a3 b1 b2")), + ValueIs(SameRange(findSpelled("a1 a2 a3 b1 b2")))); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("a1 a2 a3")), + ValueIs(SameRange(findSpelled("a1 a2 a3")))); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("b1 b2")), + ValueIs(SameRange(findSpelled("b1 b2")))); + + // Test search on simple macro expansions. 
+ recordTokens(R"cpp( + #define A a1 a2 a3 + #define B b1 b2 + + A split B + )cpp"); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("a1 a2 a3 split b1 b2")), + ValueIs(SameRange(findSpelled("A split B")))); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("a1 a2 a3")), + ValueIs(SameRange(findSpelled("A split").drop_back()))); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("b1 b2")), + ValueIs(SameRange(findSpelled("split B").drop_front()))); + // Ranges not fully covering macro invocations should fail. + EXPECT_EQ(Buffer.spelledForExpanded(findExpanded("a1 a2")), llvm::None); + EXPECT_EQ(Buffer.spelledForExpanded(findExpanded("b2")), llvm::None); + EXPECT_EQ(Buffer.spelledForExpanded(findExpanded("a2 a3 split b1 b2")), + llvm::None); + + // Recursive macro invocations. + recordTokens(R"cpp( + #define ID(x) x + #define B b1 b2 + + ID(ID(ID(a1) a2 a3)) split ID(B) + )cpp"); + + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("a1 a2 a3")), + ValueIs(SameRange(findSpelled("ID ( ID ( ID ( a1 ) a2 a3 ) )")))); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("b1 b2")), + ValueIs(SameRange(findSpelled("ID ( B )")))); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("a1 a2 a3 split b1 b2")), + ValueIs(SameRange(findSpelled( + "ID ( ID ( ID ( a1 ) a2 a3 ) ) split ID ( B )")))); + // Ranges crossing macro call boundaries. + EXPECT_EQ(Buffer.spelledForExpanded(findExpanded("a1 a2 a3 split b1")), + llvm::None); + EXPECT_EQ(Buffer.spelledForExpanded(findExpanded("a2 a3 split b1")), + llvm::None); + // FIXME: next two examples should map to macro arguments, but currently they + // fail. + EXPECT_EQ(Buffer.spelledForExpanded(findExpanded("a2")), llvm::None); + EXPECT_EQ(Buffer.spelledForExpanded(findExpanded("a1 a2")), llvm::None); + + // Empty macro expansions. 
+ recordTokens(R"cpp( + #define EMPTY + #define ID(X) X + + EMPTY EMPTY ID(1 2 3) EMPTY EMPTY split1 + EMPTY EMPTY ID(4 5 6) split2 + ID(7 8 9) EMPTY EMPTY + )cpp"); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("1 2 3")), + ValueIs(SameRange(findSpelled("ID ( 1 2 3 )")))); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("4 5 6")), + ValueIs(SameRange(findSpelled("ID ( 4 5 6 )")))); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("7 8 9")), + ValueIs(SameRange(findSpelled("ID ( 7 8 9 )")))); + + // Empty mappings coming from various directives. + recordTokens(R"cpp( + #define ID(X) X + ID(1) + #pragma lalala + not_mapped + )cpp"); + EXPECT_THAT(Buffer.spelledForExpanded(findExpanded("not_mapped")), + ValueIs(SameRange(findSpelled("not_mapped")))); +} + +// FIXME: add tests for mapping spelled tokens into offsets. + +} // namespace Index: clang/unittests/Tooling/Syntax/CMakeLists.txt =================================================================== --- /dev/null +++ clang/unittests/Tooling/Syntax/CMakeLists.txt @@ -0,0 +1,20 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Support + ) + +add_clang_unittest(TokensTest + TokensTest.cpp +) + +target_link_libraries(TokensTest + PRIVATE + clangAST + clangBasic + clangFrontend + clangLex + clangSerialization + clangTooling + clangToolingSyntax + LLVMTestingSupport + ) Index: clang/unittests/Tooling/CMakeLists.txt =================================================================== --- clang/unittests/Tooling/CMakeLists.txt +++ clang/unittests/Tooling/CMakeLists.txt @@ -70,3 +70,6 @@ clangToolingInclusions clangToolingRefactor ) + + +add_subdirectory(Syntax) Index: clang/lib/Tooling/Syntax/Tokens.cpp =================================================================== --- /dev/null +++ clang/lib/Tooling/Syntax/Tokens.cpp @@ -0,0 +1,499 @@ +//===- Tokens.cpp - collect tokens from preprocessing ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "clang/Tooling/Syntax/Tokens.h" + +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <string> +#include <utility> +#include <vector> + +using namespace clang; +using namespace clang::syntax; + +syntax::Token::Token(const clang::Token &T) + : Token(T.getLocation(), T.getLength(), T.getKind()) { + assert(!T.isAnnotation()); +} + +llvm::StringRef syntax::Token::text(const SourceManager &SM) const { + bool Invalid = false; + const char *Start = SM.getCharacterData(location(), &Invalid); + assert(!Invalid); + return llvm::StringRef(Start, length()); +} + +llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, const Token &T) { + return OS << T.str(); +} + +FileRange::FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset) + : File(File), Begin(BeginOffset), End(EndOffset) { + assert(File.isValid()); + assert(BeginOffset <= EndOffset); +} + +FileRange::FileRange(const SourceManager &SM, SourceLocation BeginLoc, + unsigned Length) { + assert(BeginLoc.isValid()); + assert(BeginLoc.isFileID()); + + std::tie(File, Begin) = SM.getDecomposedLoc(BeginLoc); + End = Begin + Length; +} +FileRange::FileRange(const 
SourceManager &SM, SourceLocation BeginLoc, + SourceLocation EndLoc) { + assert(BeginLoc.isValid()); + assert(BeginLoc.isFileID()); + assert(EndLoc.isValid()); + assert(EndLoc.isFileID()); + assert(SM.getFileID(BeginLoc) == SM.getFileID(EndLoc)); + assert(SM.getFileOffset(BeginLoc) <= SM.getFileOffset(EndLoc)); + + std::tie(File, Begin) = SM.getDecomposedLoc(BeginLoc); + End = SM.getFileOffset(EndLoc); +} + +llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, + const FileRange &R) { + return OS << llvm::formatv("FileRange(file = {0}, offsets = {1}-{2})", + R.file().getHashValue(), R.beginOffset(), + R.endOffset()); +} + +llvm::StringRef FileRange::text(const SourceManager &SM) const { + bool Invalid = false; + StringRef Text = SM.getBufferData(File, &Invalid); + if (Invalid) + return ""; + assert(Begin <= Text.size()); + assert(End <= Text.size()); + return Text.substr(Begin, length()); +} + +std::pair<const syntax::Token *, const TokenBuffer::Mapping *> +TokenBuffer::spelledForExpandedToken(const syntax::Token *Expanded) const { + assert(Expanded); + assert(ExpandedTokens.data() <= Expanded && + Expanded < ExpandedTokens.data() + ExpandedTokens.size()); + + auto FileIt = Files.find( + SourceMgr->getFileID(SourceMgr->getExpansionLoc(Expanded->location()))); + assert(FileIt != Files.end() && "no file for an expanded token"); + + const MarkedFile &File = FileIt->second; + + unsigned ExpandedIndex = Expanded - ExpandedTokens.data(); + // Find the first mapping that produced tokens after \p Expanded. + auto It = llvm::bsearch(File.Mappings, [&](const Mapping &M) { + return ExpandedIndex < M.BeginExpanded; + }); + // Our token could only be produced by the previous mapping. + if (It == File.Mappings.begin()) { + // No previous mapping, no need to modify offsets. + return {&File.SpelledTokens[ExpandedIndex - File.BeginExpanded], nullptr}; + } + --It; // 'It' now points to last mapping that started before our token. 
+ + // Check if the token is part of the mapping. + if (ExpandedIndex < It->EndExpanded) + return {&File.SpelledTokens[It->BeginSpelled], /*Mapping*/ &*It}; + + // Not part of the mapping, use the index from previous mapping to compute the + // corresponding spelled token. + return { + &File.SpelledTokens[It->EndSpelled + (ExpandedIndex - It->EndExpanded)], + /*Mapping*/ nullptr}; +} + +llvm::ArrayRef<syntax::Token> TokenBuffer::spelledTokens(FileID FID) const { + auto It = Files.find(FID); + assert(It != Files.end()); + return It->second.SpelledTokens; +} + +std::string TokenBuffer::Mapping::str() const { + return llvm::formatv("spelled tokens: [{0},{1}), expanded tokens: [{2},{3})", + BeginSpelled, EndSpelled, BeginExpanded, EndExpanded); +} + +FileRange +TokenBuffer::offsetsForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const { + assert(!Spelled.empty()); + + FileRange R(*SourceMgr, Spelled.front().location(), + Spelled.back().endLocation()); + // Check the tokens are from an actual file. + assert(spelledTokens(R.file()).begin() <= Spelled.begin() && + Spelled.end() <= spelledTokens(R.file()).end()); + return R; +} + +llvm::Optional<llvm::ArrayRef<syntax::Token>> +TokenBuffer::spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const { + // Mapping an empty range is ambiguous in case of empty mappings at either end + // of the range, bail out in that case. + if (Expanded.empty()) + return llvm::None; + + // FIXME: also allow changes uniquely mapping to macro arguments. + + const syntax::Token *BeginSpelled; + const Mapping *BeginMapping; + std::tie(BeginSpelled, BeginMapping) = + spelledForExpandedToken(&Expanded.front()); + + const syntax::Token *LastSpelled; + const Mapping *LastMapping; + std::tie(LastSpelled, LastMapping) = + spelledForExpandedToken(&Expanded.back()); + + FileID FID = SourceMgr->getFileID(BeginSpelled->location()); + // FIXME: Handle multi-file changes by trying to map onto a common root. 
+ if (FID != SourceMgr->getFileID(LastSpelled->location())) + return llvm::None; + + const MarkedFile &File = Files.find(FID)->second; + + // Do not allow changes that cross macro expansion boundaries. + unsigned BeginExpanded = Expanded.begin() - ExpandedTokens.data(); + unsigned EndExpanded = Expanded.end() - ExpandedTokens.data(); + if (BeginMapping && BeginMapping->BeginExpanded < BeginExpanded) + return llvm::None; + if (LastMapping && EndExpanded < LastMapping->EndExpanded) + return llvm::None; + // All is good, return the result. + return llvm::makeArrayRef( + BeginMapping ? File.SpelledTokens.data() + BeginMapping->BeginSpelled + : BeginSpelled, + LastMapping ? File.SpelledTokens.data() + LastMapping->EndSpelled + : LastSpelled + 1); +} + +std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM, + const LangOptions &LO) { + std::vector<syntax::Token> Tokens; + IdentifierTable Identifiers(LO); + auto AddToken = [&](clang::Token T) { + // Fill the proper token kind for keywords, etc. + if (T.getKind() == tok::raw_identifier && !T.needsCleaning() && + !T.hasUCN()) { // FIXME: support needsCleaning and hasUCN cases. + clang::IdentifierInfo &II = Identifiers.get(T.getRawIdentifier()); + T.setIdentifierInfo(&II); + T.setKind(II.getTokenID()); + } + Tokens.push_back(syntax::Token(T)); + }; + + Lexer L(FID, SM.getBuffer(FID), SM, LO); + + clang::Token T; + while (!L.LexFromRawLexer(T)) + AddToken(T); + // 'eof' is only the last token if the input is null-terminated. Never store + // it, for consistency. + if (T.getKind() != tok::eof) + AddToken(T); + return Tokens; +} + +/// Fills in the TokenBuffer by tracing the run of a preprocessor. The +/// implementation tracks the tokens, macro expansions and directives coming +/// from the preprocessor and: +/// - for each token, figures out if it is a part of an expanded token stream, +/// spelled token stream or both. Stores the tokens appropriately. 
+/// - records mappings from the spelled to expanded token ranges, e.g. for macro +/// expansions. +/// FIXME: also properly record: +/// - #include directives, +/// - #pragma, #line and other PP directives, +/// - skipped pp regions, +/// - ... + +TokenCollector::TokenCollector(Preprocessor &PP) + : SourceMgr(PP.getSourceManager()), LangOpts(PP.getLangOpts()) { + // Record each token the preprocessor emits; the callback captures `this`. + PP.setTokenWatcher([this](const clang::Token &T) { + DEBUG_WITH_TYPE("collect-tokens", + llvm::dbgs() + << "Token: " << syntax::Token(T).dumpForTests(SourceMgr) + << "\n" + + ); + Expanded.push_back(syntax::Token(T)); + }); +} + +/// Builds mappings and spelled tokens in the TokenBuffer based on the expanded +/// token stream. +class TokenCollector::Builder { +public: + Builder(std::vector<syntax::Token> Expanded, const SourceManager &SM, + const LangOptions &LangOpts) + : Result(SM), SM(SM), LangOpts(LangOpts) { + Result.ExpandedTokens = std::move(Expanded); + } + + TokenBuffer build() && { + buildSpelledTokens(); + + // Walk over expanded tokens and spelled tokens in parallel, building the + // mappings between those using source locations. + + // The 'eof' token is special, it is not part of spelled token stream. We + // handle it separately at the end. + assert(!Result.ExpandedTokens.empty()); + assert(Result.ExpandedTokens.back().kind() == tok::eof); + for (unsigned I = 0; I < Result.ExpandedTokens.size() - 1; ++I) { + // (!) I might be updated by the following call. + processExpandedToken(I); + } + + // 'eof' not handled in the loop, do it here.
+ assert(SM.getMainFileID() == + SM.getFileID(Result.ExpandedTokens.back().location())); + fillGapUntil(Result.Files[SM.getMainFileID()], + Result.ExpandedTokens.back().location(), + Result.ExpandedTokens.size() - 1); + Result.Files[SM.getMainFileID()].EndExpanded = Result.ExpandedTokens.size(); + + // Some files might have unaccounted spelled tokens at the end, add an empty + // mapping for those as they did not have expanded counterparts. + fillGapsAtEndOfFiles(); + + return std::move(Result); + } + +private: + /// Processes the next expanded token: skips the corresponding spelled + /// tokens and records a mapping if needed. + /// (!) \p I will be updated if this had to skip tokens, e.g. for macros. + void processExpandedToken(unsigned &I) { + auto L = Result.ExpandedTokens[I].location(); + if (L.isMacroID()) { + processMacroExpansion(SM.getExpansionRange(L), I); + return; + } + if (L.isFileID()) { + auto FID = SM.getFileID(L); + TokenBuffer::MarkedFile &File = Result.Files[FID]; + + fillGapUntil(File, L, I); + + // Skip the token. + assert(File.SpelledTokens[NextSpelled[FID]].location() == L && + "no corresponding token in the spelled stream"); + ++NextSpelled[FID]; + return; + } + } + + /// Skips expanded and spelled tokens of a macro expansion that covers \p + /// SpelledRange. Adds a corresponding mapping. + /// (!) \p I will be the index of the last token in an expansion after this + /// function returns. + void processMacroExpansion(CharSourceRange SpelledRange, unsigned &I) { + auto FID = SM.getFileID(SpelledRange.getBegin()); + assert(FID == SM.getFileID(SpelledRange.getEnd())); + TokenBuffer::MarkedFile &File = Result.Files[FID]; + + fillGapUntil(File, SpelledRange.getBegin(), I); + + TokenBuffer::Mapping M; + // Skip the spelled macro tokens. + std::tie(M.BeginSpelled, M.EndSpelled) = + consumeSpelledUntil(File, SpelledRange.getEnd().getLocWithOffset(1)); + // Skip all expanded tokens from the same macro expansion.
+ M.BeginExpanded = I; + for (; I + 1 < Result.ExpandedTokens.size(); ++I) { + auto NextL = Result.ExpandedTokens[I + 1].location(); + if (!NextL.isMacroID() || + SM.getExpansionLoc(NextL) != SpelledRange.getBegin()) + break; + } + M.EndExpanded = I + 1; + + // Add a resulting mapping. + File.Mappings.push_back(M); + } + + /// Initializes TokenBuffer::Files and fills spelled tokens and expanded + /// ranges for each of the files. + void buildSpelledTokens() { + for (unsigned I = 0; I < Result.ExpandedTokens.size(); ++I) { + auto FID = + SM.getFileID(SM.getExpansionLoc(Result.ExpandedTokens[I].location())); + auto It = Result.Files.try_emplace(FID); + TokenBuffer::MarkedFile &File = It.first->second; + + File.EndExpanded = I + 1; + if (!It.second) + continue; // we have seen this file before. + + // This is the first time we see this file. + File.BeginExpanded = I; + File.SpelledTokens = tokenize(FID, SM, LangOpts); + } + } + + /// Consumes spelled tokens until location L is reached (token starting at L + /// is not included). Returns the indices of the consumed range. + std::pair</*Begin*/ unsigned, /*End*/ unsigned> + consumeSpelledUntil(TokenBuffer::MarkedFile &File, SourceLocation L) { + assert(L.isFileID()); + FileID FID; + unsigned Offset; + std::tie(FID, Offset) = SM.getDecomposedLoc(L); + + // (!) we update the index in-place. + unsigned &SpelledI = NextSpelled[FID]; + unsigned Before = SpelledI; + for (; SpelledI < File.SpelledTokens.size() && + SM.getFileOffset(File.SpelledTokens[SpelledI].location()) < Offset; + ++SpelledI) { + } + return std::make_pair(Before, /*After*/ SpelledI); + }; + + /// Consumes spelled tokens until location \p L is reached and adds a mapping + /// covering the consumed tokens. The mapping will point to an empty expanded + /// range at position \p ExpandedIndex.
+ void fillGapUntil(TokenBuffer::MarkedFile &File, SourceLocation L, + unsigned ExpandedIndex) { + unsigned BeginSpelledGap, EndSpelledGap; + std::tie(BeginSpelledGap, EndSpelledGap) = consumeSpelledUntil(File, L); + if (BeginSpelledGap == EndSpelledGap) + return; // No gap. + TokenBuffer::Mapping M; + M.BeginSpelled = BeginSpelledGap; + M.EndSpelled = EndSpelledGap; + M.BeginExpanded = M.EndExpanded = ExpandedIndex; + File.Mappings.push_back(M); + }; + + /// Maps unconsumed spelled tokens at the end of each file to an empty range. + void fillGapsAtEndOfFiles() { + for (auto &F : Result.Files) { + unsigned Next = NextSpelled[F.first]; + if (F.second.SpelledTokens.size() == Next) + continue; // All spelled tokens are accounted for. + + // Record a mapping for the gap at the end of the spelled tokens. + TokenBuffer::Mapping M; + M.BeginSpelled = Next; + M.EndSpelled = F.second.SpelledTokens.size(); + M.BeginExpanded = F.second.EndExpanded; + M.EndExpanded = F.second.EndExpanded; + + F.second.Mappings.push_back(M); + } + } + + TokenBuffer Result; + /// For each file, a position of the next spelled token we will consume.
+ llvm::DenseMap<FileID, unsigned> NextSpelled; + const SourceManager &SM; + const LangOptions &LangOpts; +}; + +TokenBuffer TokenCollector::consume() && { + return Builder(std::move(Expanded), SourceMgr, LangOpts).build(); +} + +std::string syntax::Token::str() const { + return llvm::formatv("Token({0}, length = {1})", tok::getTokenName(kind()), + length()); +} + +std::string syntax::Token::dumpForTests(const SourceManager &SM) const { + return llvm::formatv("{0} {1}", tok::getTokenName(kind()), text(SM)); +} + +std::string TokenBuffer::dumpForTests() const { + auto PrintToken = [this](const syntax::Token &T) -> std::string { + if (T.kind() == tok::eof) + return "<eof>"; + return T.text(*SourceMgr); + }; + + auto DumpTokens = [this, &PrintToken](llvm::raw_ostream &OS, + llvm::ArrayRef<syntax::Token> Tokens) { + if (Tokens.empty() || + (Tokens.size() == 1 && Tokens[0].kind() == tok::eof)) { + OS << "<empty>"; // Nothing printable, e.g. tokens of an empty file. + return; + } + OS << Tokens[0].text(*SourceMgr); + for (unsigned I = 1; I < Tokens.size(); ++I) { + if (Tokens[I].kind() == tok::eof) + continue; // 'eof' is never printed as part of a token list. + OS << " " << PrintToken(Tokens[I]); + } + }; + + std::string Dump; + llvm::raw_string_ostream OS(Dump); + + OS << "expanded tokens:\n" + << " "; + DumpTokens(OS, ExpandedTokens); + OS << "\n"; + + std::vector<FileID> Keys; + for (const auto &F : Files) // Only the key is read; do not copy the file. + Keys.push_back(F.first); + llvm::sort(Keys); // Dump the files in a stable order. + + for (FileID ID : Keys) { + const MarkedFile &File = Files.find(ID)->second; + auto *Entry = SourceMgr->getFileEntryForID(ID); + if (!Entry) + continue; // Skip builtin files.
+ OS << llvm::formatv("file '{0}'\n", Entry->getName()) + << " spelled tokens:\n" + << " "; + DumpTokens(OS, File.SpelledTokens); + OS << "\n"; + + if (File.Mappings.empty()) { + OS << " no mappings.\n"; + continue; + } + OS << " mappings:\n"; + for (auto &M : File.Mappings) { + OS << llvm::formatv( + " ['{0}'_{1}, '{2}'_{3}) => ['{4}'_{5}, '{6}'_{7})\n", + PrintToken(File.SpelledTokens[M.BeginSpelled]), M.BeginSpelled, + M.EndSpelled == File.SpelledTokens.size() + ? "<eof>" + : PrintToken(File.SpelledTokens[M.EndSpelled]), + M.EndSpelled, PrintToken(ExpandedTokens[M.BeginExpanded]), + M.BeginExpanded, PrintToken(ExpandedTokens[M.EndExpanded]), + M.EndExpanded); + } + } + return OS.str(); +} Index: clang/lib/Tooling/Syntax/CMakeLists.txt =================================================================== --- /dev/null +++ clang/lib/Tooling/Syntax/CMakeLists.txt @@ -0,0 +1,10 @@ +set(LLVM_LINK_COMPONENTS Support) + +add_clang_library(clangToolingSyntax + Tokens.cpp + + LINK_LIBS + clangBasic + clangFrontend + clangLex + ) Index: clang/lib/Tooling/CMakeLists.txt =================================================================== --- clang/lib/Tooling/CMakeLists.txt +++ clang/lib/Tooling/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(Inclusions) add_subdirectory(Refactoring) add_subdirectory(ASTDiff) +add_subdirectory(Syntax) add_clang_library(clangTooling AllTUsExecution.cpp Index: clang/include/clang/Tooling/Syntax/Tokens.h =================================================================== --- /dev/null +++ clang/include/clang/Tooling/Syntax/Tokens.h @@ -0,0 +1,296 @@ +//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Record tokens that a preprocessor emits and define operations to map between +// the tokens written in a file and tokens produced by the preprocessor. +// +// When running the compiler, there are two token streams we are interested in: +// - "spelled" tokens directly correspond to a substring written in some +// source file. +// - "expanded" tokens represent the result of preprocessing; the parser +// consumes this token stream to produce the AST. +// +// Expanded tokens correspond directly to locations found in the AST, allowing +// us to find subranges of the token stream covered by various AST nodes. Spelled +// tokens correspond directly to the source code written by the user. +// +// To allow composing these two use-cases, we also define operations that map +// between expanded and spelled tokens that produced them (macro calls, +// directives, etc). +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H +#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H + +#include "clang/Basic/FileManager.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Lex/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" +#include <cstdint> +#include <tuple> + +namespace clang { +class Preprocessor; + +namespace syntax { + +/// A token coming directly from a file or from a macro invocation. Has just +/// enough information to locate the token in the source code. +/// Can represent both expanded and spelled tokens.
+class Token { +public: + Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind) + : Location(Location), Length(Length), Kind(Kind) {} + /// EXPECTS: clang::Token is not an annotation token. + explicit Token(const clang::Token &T); + + tok::TokenKind kind() const { return Kind; } + /// Location of the first character of a token. + SourceLocation location() const { return Location; } + /// Location right after the last character of a token. + SourceLocation endLocation() const { + return Location.getLocWithOffset(Length); + } + unsigned length() const { return Length; } + + /// Get the substring covered by the token. Note that it includes all + /// digraphs, newline continuations, etc. E.g. tokens for 'int' and + /// in\ + /// t + /// both have the same kind tok::kw_int, but results of text() are different. + llvm::StringRef text(const SourceManager &SM) const; + + std::string dumpForTests(const SourceManager &SM) const; + /// For debugging purposes. + std::string str() const; + +private: + SourceLocation Location; + unsigned Length; + tok::TokenKind Kind; +}; +/// For debugging purposes. Equivalent to a call to Token::str(). +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T); + +/// A half-open range inside a particular file, the start offset is included and +/// the end offset is excluded from the range. +struct FileRange { + /// EXPECTS: File.isValid() && BeginOffset <= EndOffset. + FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset); + /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(). + FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length); + /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), EndLoc is not before + /// BeginLoc and both locations are in the same file. + FileRange(const SourceManager &SM, SourceLocation BeginLoc, SourceLocation EndLoc); + + FileID file() const { return File; } + /// Start offset (inclusive) in the corresponding file.
+ unsigned beginOffset() const { return Begin; } + /// End offset (exclusive) in the corresponding file. + unsigned endOffset() const { return End; } + + unsigned length() const { return End - Begin; } + + /// Gets the substring that this FileRange refers to. + llvm::StringRef text(const SourceManager &SM) const; + + friend bool operator==(const FileRange &L, const FileRange &R) { + return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End); + } + friend bool operator!=(const FileRange &L, const FileRange &R) { + return !(L == R); + } + +private: + FileID File; + unsigned Begin; + unsigned End; +}; + +/// For debugging purposes. +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R); + +/// A list of tokens obtained by preprocessing a text buffer and operations to +/// map between the expanded and spelled tokens, i.e. TokenBuffer has +/// information about two token streams: +/// 1. Expanded tokens: tokens produced by the preprocessor after all macro +/// replacements, +/// 2. Spelled tokens: corresponding directly to the source code of a file +/// before any macro replacements occurred. +/// Here's an example to illustrate the difference between those two: +/// #define FOO 10 +/// int a = FOO; +/// +/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}. +/// Expanded tokens are {'int','a','=','10',';','eof'}. +/// +/// Note that the expanded token stream has a tok::eof token at the end, the +/// spelled tokens never store an 'eof' token. +/// +/// The full list of expanded tokens can be obtained with expandedTokens(). +/// Spelled tokens of each file can be obtained via spelledTokens(FileID). +/// +/// To map between the expanded and spelled tokens use spelledForExpanded(). +/// +/// To build a token buffer use the TokenCollector class. You can also compute +/// the spelled tokens of a file using the tokenize() helper. +/// +/// FIXME: allow mapping spelled to expanded tokens when a use-case shows up.
+class TokenBuffer { +public: + TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {} + /// All tokens produced by the preprocessor after all macro replacements, + /// directives, etc. Source locations found in the clang AST will always + /// point to one of these tokens. + /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split + /// into two '>' tokens by the parser. However, TokenBuffer currently + /// keeps it as a single '>>' token. + llvm::ArrayRef<syntax::Token> expandedTokens() const { + return ExpandedTokens; + } + + /// Find the subrange of spelled tokens that produced the corresponding \p + /// Expanded tokens. + /// + /// EXPECTS: \p Expanded is a subrange of expandedTokens(). + /// + /// Will fail if the expanded tokens do not correspond to a + /// sequence of spelled tokens. E.g. for the following example: + /// + /// #define FIRST f1 f2 f3 + /// #define SECOND s1 s2 s3 + /// + /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c + /// + /// the results would be: + /// expanded => spelled + /// ------------------------ + /// a => a + /// s1 s2 s3 => SECOND + /// a f1 f2 f3 => a FIRST + /// a f1 => can't map + /// s1 s2 => can't map + /// + /// If \p Expanded is empty, the returned value is llvm::None. + /// Complexity is logarithmic. + llvm::Optional<llvm::ArrayRef<syntax::Token>> + spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const; + + /// Returns the text range, corresponding to a sequence of spelled tokens. + /// EXPECTS: \p Spelled is not empty. + /// EXPECTS: \p Spelled is a subrange of spelledTokens(F) for some file F. + FileRange offsetsForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; + + /// Lexed tokens of a file before preprocessing. E.g. for the following input + /// #define DECL(name) int name = 10 + /// DECL(a); + /// spelledTokens() returns {"#", "define", "DECL", "(", "name", ")", ...}.
+ /// FIXME: we do not yet store tokens of directives, like #include, #define, + /// #pragma, etc. + llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const; + + std::string dumpForTests() const; + +private: + /// Describes a mapping between a contiguous subrange of spelled tokens and + /// expanded tokens. Represents macro expansions, preprocessor directives, + /// conditionally disabled pp regions, etc. + /// #define FOO 1+2 + /// #define BAR(a) a + 1 + /// FOO // invocation #1, expanded = {'1','+','2'}, spelled = {'FOO'}. + /// BAR(1) // invocation #2, expanded = {'1', '+', '1'}, + /// spelled = {'BAR', '(', '1', ')'}. + struct Mapping { + // Positions in the corresponding spelled token stream. The corresponding + // range is never empty. + unsigned BeginSpelled = 0; + unsigned EndSpelled = 0; + // Positions in the expanded token stream. The corresponding range can be + // empty. + unsigned BeginExpanded = 0; + unsigned EndExpanded = 0; + + /// For debugging purposes. + std::string str() const; + }; + /// Spelled tokens of the file with information about the subranges. + struct MarkedFile { + /// Lexed, but not preprocessed, tokens of the file. These map directly to + /// text in the corresponding files and include tokens of all preprocessor + /// directives. + /// FIXME: spelled tokens don't change across FileID that map to the same + /// FileEntry. We could consider deduplicating them to save memory. + std::vector<syntax::Token> SpelledTokens; + /// A sorted list to convert between the spelled and expanded token streams. + std::vector<Mapping> Mappings; + /// Expanded indices of the first and one-past-last token of this FileID. + unsigned BeginExpanded = 0; + unsigned EndExpanded = 0; + }; + + friend class TokenCollector; + + /// Maps a single expanded token to its spelled counterpart or a mapping that + /// produced it.
+ std::pair<const syntax::Token *, const Mapping *> + spelledForExpandedToken(const syntax::Token *Expanded) const; + + /// Token stream produced after preprocessing, conceptually this captures the + /// same stream as 'clang -E' (excluding the preprocessor directives like + /// #file, etc.). + std::vector<syntax::Token> ExpandedTokens; + llvm::DenseMap<FileID, MarkedFile> Files; + // The value is never null, pointer instead of reference to avoid disabling + // implicit assignment operator. + const SourceManager *SourceMgr; +}; + +/// Lex the text buffer, corresponding to \p FID, in raw mode and record the +/// resulting spelled tokens. Does minimal post-processing on raw identifiers, +/// setting the appropriate token kind (instead of the raw_identifier reported +/// by lexer in raw mode). This is a very low-level function, most users should +/// prefer to use TokenCollector. Lexing in raw mode produces wildly different +/// results from what one might expect when running a C++ frontend, e.g. +/// preprocessor does not run at all. +/// The result will *not* have an 'eof' token at the end. +std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM, + const LangOptions &LO); + +/// Collects tokens for the main file while running the frontend action. An +/// instance of this object should be created on +/// FrontendAction::BeginSourceFile() and the results should be consumed after +/// FrontendAction::Execute() finishes. +class TokenCollector { +public: + /// Adds the hooks to collect the tokens. Should be called before the + /// preprocessing starts, i.e. as a part of BeginSourceFile() or + /// CreateASTConsumer(). + TokenCollector(Preprocessor &P); + + /// Finalizes token collection. Should be called after preprocessing is + /// finished, i.e. after running Execute().
+ LLVM_NODISCARD TokenBuffer consume() &&; + +private: + class Builder; + std::vector<syntax::Token> Expanded; + const SourceManager &SourceMgr; + const LangOptions &LangOpts; +}; + +} // namespace syntax +} // namespace clang + +#endif
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits