ilya-biryukov created this revision. ilya-biryukov added a reviewer: gribozavr. Herald added subscribers: jdoerfert, mgorny. Herald added a project: clang.
TokenBuffer stores the list of tokens for a file obtained after preprocessing. This is a base building block for syntax trees; see [1] for the full proposal on syntax trees. This commit also starts a new sub-library of ClangTooling, which would be the home for the syntax trees and syntax-tree-based refactoring utilities. [1]: https://lists.llvm.org/pipermail/cfe-dev/2019-February/061414.html Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D59887 Files: clang/include/clang/Tooling/Syntax/TokenBuffer.h clang/lib/Tooling/CMakeLists.txt clang/lib/Tooling/Syntax/CMakeLists.txt clang/lib/Tooling/Syntax/TokenBuffer.cpp clang/unittests/Tooling/CMakeLists.txt clang/unittests/Tooling/Syntax/CMakeLists.txt clang/unittests/Tooling/Syntax/TokenBufferTest.cpp
Index: clang/unittests/Tooling/Syntax/TokenBufferTest.cpp =================================================================== --- /dev/null +++ clang/unittests/Tooling/Syntax/TokenBufferTest.cpp @@ -0,0 +1,471 @@ +//===- TokenBufferTest.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/TokenBuffer.h" +#include "clang/AST/ASTConsumer.h" +#include "clang/AST/Expr.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/DiagnosticIDs.h" +#include "clang/Basic/DiagnosticOptions.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/FileSystemOptions.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.def" +#include "clang/Basic/TokenKinds.h" +#include "clang/Frontend/CompilerInstance.h" +#include "clang/Frontend/FrontendAction.h" +#include "clang/Frontend/Utils.h" +#include "clang/Lex/Lexer.h" +#include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/Token.h" +#include "clang/Tooling/Tooling.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/IntrusiveRefCntPtr.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Testing/Support/Annotations.h" +#include <cassert> +#include <gmock/gmock.h> +#include <gtest/gtest.h> +#include <memory> +#include <ostream> +#include <string> + +using namespace clang; +using namespace clang::syntax; + +using 
::testing::AllOf; +using ::testing::Contains; +using ::testing::ElementsAre; +using ::testing::IsEmpty; +using ::testing::Matcher; +using ::testing::Pointwise; + +// Debug printers. +// FIXME: This should live somewhere else or be implemented as 'operator +// <<(raw_ostream&, T)'. +namespace clang { +namespace tok { +inline void PrintTo(TokenKind K, std::ostream *OS) { + *OS << tok::getTokenName(K); +} +} // namespace tok +namespace syntax { +inline void PrintTo(const syntax::Token &T, std::ostream *OS) { + PrintTo(T.kind(), OS); + OS->flush(); +} +} // namespace syntax +} // namespace clang + +namespace { +// Matchers for syntax::Token. +MATCHER_P(Kind, K, "") { return arg.kind() == K; } +MATCHER_P2(HasText, Text, SourceMgr, "") { + return arg.text(*SourceMgr) == Text; +} +MATCHER_P2(IsIdent, Text, SourceMgr, "") { + return arg.kind() == tok::identifier && arg.text(*SourceMgr) == Text; +} +/// Checks the start and end location of a token are equal to SourceRng. +MATCHER_P(RangeIs, SourceRng, "") { + return arg.location() == SourceRng.first && + arg.endLocation() == SourceRng.second; +} +/// Checks the passed tuple has two similar tokens, i.e. both are of the same +/// kind and have the same text in the source buffer. +MATCHER_P(IsSameToken, SourceMgr, "") { + auto &L = std::get<0>(arg); + auto &R = std::get<1>(arg); + if (L.kind() != R.kind()) + return false; + return L.text(*SourceMgr) == R.text(*SourceMgr); +} + +class TokenBufferTest : public ::testing::Test { +public: + /// Run the clang frontend, collect the preprocessed tokens from the frontend + /// invocation and store them in this->Buffer. + /// This also clears SourceManager before running the compiler. 
+ void recordTokens(llvm::StringRef Code) { + class RecordTokens : public ASTFrontendAction { + public: + explicit RecordTokens(TokenBuffer &Result) : Result(Result) {} + + bool BeginSourceFileAction(CompilerInstance &CI) override { + assert(!Collector && "expected only a single call to BeginSourceFile"); + Collector.emplace(CI.getPreprocessor()); + return true; + } + void EndSourceFileAction() override { + assert(Collector && "BeginSourceFileAction was never called"); + Result = std::move(*Collector).consume(); + } + + std::unique_ptr<ASTConsumer> + CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override { + return llvm::make_unique<ASTConsumer>(); + } + + private: + TokenBuffer &Result; + llvm::Optional<TokenCollector> Collector; + }; + + constexpr const char *FileName = "./input.cpp"; + FS->addFile(FileName, time_t(), llvm::MemoryBuffer::getMemBufferCopy("")); + // Prepare to run a compiler. + std::vector<const char *> Args = {"tok-test", "-std=c++03", "-fsyntax-only", + FileName}; + auto CI = createInvocationFromCommandLine(Args, Diags, FS); + assert(CI); + CI->getFrontendOpts().DisableFree = false; + CI->getPreprocessorOpts().addRemappedFile( + FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release()); + LangOpts = *CI->getLangOpts(); + CompilerInstance Compiler; + Compiler.setInvocation(std::move(CI)); + if (!Diags->getClient()) + Diags->setClient(new IgnoringDiagConsumer); + Compiler.setDiagnostics(Diags.get()); + Compiler.setFileManager(FileMgr.get()); + Compiler.setSourceManager(SourceMgr.get()); + + this->Buffer = TokenBuffer(); + RecordTokens Recorder(this->Buffer); + ASSERT_TRUE(Compiler.ExecuteAction(Recorder)) + << "failed to run the frontend"; + } + + /// Run syntax::tokenize() and return the results. + TokenBuffer tokenize(llvm::StringRef Text) { + // Null-terminate so that we always see 'tok::eof' at the end. 
+ std::string NullTerminated = Text.str(); + auto FID = SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy( + StringRef(NullTerminated.data(), NullTerminated.size() + 1))); + return syntax::tokenize(FID, *SourceMgr, LangOpts); + } + + /// Checks that lexing \p ExpectedText in raw mode would produce the same + /// token stream as the one stored in this->Buffer.tokens(). + void checkTokens(llvm::StringRef ExpectedText) { + auto TokenizedCode = tokenize(ExpectedText); + std::vector<syntax::Token> ExpectedTokens = TokenizedCode.tokens(); + EXPECT_THAT(std::vector<syntax::Token>(Buffer.tokens()), + Pointwise(IsSameToken(), ExpectedTokens)) + << "\texpected tokens: " << ExpectedText; + } + + struct ExpectedExpansion { + ExpectedExpansion(std::string From, std::string To, + llvm::Optional<llvm::Range> Range = llvm::None) + : From(std::move(From)), To(std::move(To)), Range(Range) {} + /// A textual representation of the macro tokens. + std::string From; + /// A textual representation of the expansion result. + std::string To; + /// A text range the expansion points to. + llvm::Optional<llvm::Range> Range; + }; + /// Checks the expansions in this->Buffer.expansions() match the \p + /// Expected ones. 
+ void checkExpansions(llvm::ArrayRef<ExpectedExpansion> Expected) { + auto Actual = Buffer.expansions(); + ASSERT_EQ(Actual.size(), Expected.size()); + + for (unsigned I = 0; I < Actual.size(); ++I) { + auto &A = Actual[I]; + auto &E = Expected[I]; + + if (E.Range) + ASSERT_EQ(A.macroRange(Buffer, *SourceMgr), + (std::pair<unsigned, unsigned>(E.Range->Begin, E.Range->End))) + << "\trange does not match"; + + ASSERT_THAT( + std::vector<syntax::Token>(A.macroTokens(Buffer)), + Pointwise(IsSameToken(), std::vector<syntax::Token>( + tokenize(E.From).tokens().drop_back()))) + << "\tmacro tokens do not match, expected " << E.From; + + ASSERT_THAT( + std::vector<syntax::Token>(A.tokens(Buffer)), + Pointwise(IsSameToken(), std::vector<syntax::Token>( + tokenize(E.To).tokens().drop_back()))) + << "\ttokens after expansion do not match, expected " << E.To; + } + } + + // Specialized versions of matchers that rely on SourceManager. + Matcher<syntax::Token> IsIdent(std::string Text) const { + return ::IsIdent(Text, SourceMgr.get()); + } + Matcher<syntax::Token> HasText(std::string Text) const { + return ::HasText(Text, SourceMgr.get()); + } + Matcher<syntax::Token> RangeIs(llvm::Range R) const { + std::pair<SourceLocation, SourceLocation> Ls; + Ls.first = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID()) + .getLocWithOffset(R.Begin); + Ls.second = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID()) + .getLocWithOffset(R.End); + return ::RangeIs(Ls); + } + Matcher<std::tuple<const syntax::Token &, const syntax::Token &>> + IsSameToken() const { + return ::IsSameToken(SourceMgr.get()); + } + + // Data fields. 
+ llvm::IntrusiveRefCntPtr<DiagnosticsEngine> Diags = + new DiagnosticsEngine(new DiagnosticIDs, new DiagnosticOptions); + IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS = + new llvm::vfs::InMemoryFileSystem; + llvm::IntrusiveRefCntPtr<FileManager> FileMgr = + new FileManager(FileSystemOptions(), FS); + llvm::IntrusiveRefCntPtr<SourceManager> SourceMgr = + new SourceManager(*Diags, *FileMgr); + /// Contains last result of calling recordTokens(). + TokenBuffer Buffer; + /// Contains options from last run of recordTokens(). + LangOptions LangOpts; +}; + +TEST_F(TokenBufferTest, RawMode) { + EXPECT_THAT(tokenize("int main() {}").tokens(), + ElementsAre(Kind(tok::kw_int), IsIdent("main"), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace), Kind(tok::r_brace), + Kind(tok::eof))); + // Comments are ignored for now. + EXPECT_THAT(tokenize("/* foo */int a; // more comments").tokens(), + ElementsAre(Kind(tok::kw_int), IsIdent("a"), Kind(tok::semi), + Kind(tok::eof))); +} + +TEST_F(TokenBufferTest, Basic) { + recordTokens("int main() {}"); + EXPECT_THAT(Buffer.tokens(), + ElementsAre(Kind(tok::kw_int), IsIdent("main"), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace), Kind(tok::r_brace), + Kind(tok::eof))); + // All kinds of whitespace are ignored. 
+ recordTokens("\t\n int\t\n main\t\n (\t\n )\t\n{\t\n }\t\n"); + EXPECT_THAT(Buffer.tokens(), + ElementsAre(Kind(tok::kw_int), IsIdent("main"), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace), Kind(tok::r_brace), + Kind(tok::eof))); + + llvm::Annotations Code(R"cpp( + $r1[[int]] $r2[[a]] $r3[[=]] $r4[["foo bar baz"]] $r5[[;]] + )cpp"); + recordTokens(Code.code()); + EXPECT_THAT( + Buffer.tokens(), + ElementsAre(AllOf(Kind(tok::kw_int), RangeIs(Code.range("r1"))), + AllOf(Kind(tok::identifier), RangeIs(Code.range("r2"))), + AllOf(Kind(tok::equal), RangeIs(Code.range("r3"))), + AllOf(Kind(tok::string_literal), RangeIs(Code.range("r4"))), + AllOf(Kind(tok::semi), RangeIs(Code.range("r5"))), + Kind(tok::eof))); +} + +TEST_F(TokenBufferTest, MacroDirectives) { + // Macro directives are not stored anywhere at the moment. + recordTokens(R"cpp( + #define FOO a + #include "unresolved_file.h" + #undef FOO + #ifdef X + #else + #endif + #ifndef Y + #endif + #if 1 + #elif 2 + #else + #endif + #pragma once + #pragma something lalala + + int a; + )cpp"); + + checkTokens("int a;"); + EXPECT_THAT(Buffer.expansions(), IsEmpty()); + EXPECT_THAT(Buffer.macroTokens(), IsEmpty()); +} + +TEST_F(TokenBufferTest, MacroExpansions) { + // A simple macro definition and expansion. + llvm::Annotations Code(R"cpp( + #define INT int const + [[INT]] a; + )cpp"); + recordTokens(Code.code()); + + checkTokens("int const a;"); + checkExpansions({{"INT", "int const", Code.range()}}); + + // A simple functional macro invocation. + Code = llvm::Annotations(R"cpp( + #define INT(a) const int + [[INT(10+10)]] a; + )cpp"); + recordTokens(Code.code()); + + checkTokens("const int a;"); + checkExpansions({{"INT(10+10)", "const int", Code.range()}}); + + // Recursive macro expansions. 
+ Code = llvm::Annotations(R"cpp( + #define ID(X) X + #define INT int const + [[ID(ID(INT))]] a; + )cpp"); + recordTokens(Code.code()); + + checkTokens("int const a;"); + checkExpansions({{"ID(ID(INT))", "int const", Code.range()}}); + + // Empty macro expansions. + Code = llvm::Annotations(R"cpp( + #define EMPTY + #define EMPTY_FUNC(X) + $m[[EMPTY]] + $f[[EMPTY_FUNC(1+2+3)]] + )cpp"); + recordTokens(Code.code()); + + checkTokens(""); + checkExpansions({{"EMPTY", "", Code.range("m")}, + {"EMPTY_FUNC(1+2+3)", "", Code.range("f")}}); +} + +TEST_F(TokenBufferTest, SpecialTokens) { + // Tokens coming from concatenations. + recordTokens(R"cpp( + #define CONCAT(a, b) a ## b + int a = CONCAT(1, 2); + )cpp"); + checkTokens("int a = 12;"); + // Multi-line tokens with backslashes at the end of lines. + recordTokens("i\\\nn\\\nt"); + EXPECT_THAT(Buffer.tokens(), + ElementsAre(AllOf(Kind(tok::kw_int), HasText("i\\\nn\\\nt")), + Kind(tok::eof))); + // FIXME: test tokens with digraphs and UCN identifiers. +} + +TEST_F(TokenBufferTest, LateBoundTokens) { + // The parser eventually breaks the first '>>' into two tokens ('>' and '>'), + // but we choose to record them as a single token (for now). 
+ llvm::Annotations Code(R"cpp( + template <class T> + struct foo { int a; }; + int bar = foo<foo<int$br[[>>]]().a; + int baz = 10 $op[[>>]] 2; + )cpp"); + recordTokens(Code.code()); + EXPECT_THAT(std::vector<syntax::Token>(Buffer.tokens()), + AllOf(Contains(AllOf(Kind(tok::greatergreater), + RangeIs(Code.range("br")))), + Contains(AllOf(Kind(tok::greatergreater), + RangeIs(Code.range("op")))))); +} + +TEST_F(TokenBufferTest, DelayedParsing) { + llvm::StringLiteral Code = R"cpp( + struct Foo { + int method() { + // Parser will visit method bodies and initializers multiple time, but + // TokenBuffer should only record the first walk over the tokens; + return 100; + } + int a = 10; + int b = 20; + + struct Subclass { + void foo() { + Foo().method(); + } + }; + }; + )cpp"; + recordTokens(Code); + // Checks that lexing in raw mode produces the same results, hence we're not + // recording any tokens twice and the order is the same. + checkTokens(Code); +} + +TEST_F(TokenBufferTest, Offsets) { + llvm::Annotations Code(""); + auto OfKind = [this](tok::TokenKind K) { + auto It = llvm::find_if( + Buffer.tokens(), [K](const syntax::Token &T) { return T.kind() == K; }); + assert(It != Buffer.tokens().end()); + return It; + }; + auto Range = [&Code](llvm::StringRef Name) { + auto R = Code.range(Name); + return std::pair<unsigned, unsigned>(R.Begin, R.End); + }; + + Code = llvm::Annotations(R"cpp( + $all[[int $a[[a]] = $numbers[[100 + 200]];]] + )cpp"); + + recordTokens(Code.code()); + EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::kw_int), + std::next(OfKind(tok::semi)), *SourceMgr), + Range("all")); + EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::identifier), + std::next(OfKind(tok::identifier)), + *SourceMgr), + Range("a")); + EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::numeric_constant), + OfKind(tok::semi), *SourceMgr), + Range("numbers")); + + Code = llvm::Annotations(R"cpp( + #define ID(a) a + #define NUMBERS 100 + 200 + $all[[ID(int) $a[[ID(a)]] = $numbers[[NUMBERS]];]] + 
)cpp"); + recordTokens(Code.code()); + EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::kw_int), + std::next(OfKind(tok::semi)), *SourceMgr), + Range("all")); + EXPECT_EQ(*Buffer.toOffsetRange(OfKind(tok::identifier), + std::next(OfKind(tok::identifier)), + *SourceMgr), + Range("a")); + EXPECT_EQ(*Buffer.toOffsetRange(OfKind(tok::numeric_constant), + OfKind(tok::semi), *SourceMgr), + Range("numbers")); + // Ranges not fully covering macro expansions should fail. + EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::numeric_constant), + std::next(OfKind(tok::numeric_constant)), + *SourceMgr), + llvm::None); + EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::plus), + std::next(OfKind(tok::plus)), *SourceMgr), + llvm::None); +} + +} // namespace Index: clang/unittests/Tooling/Syntax/CMakeLists.txt =================================================================== --- /dev/null +++ clang/unittests/Tooling/Syntax/CMakeLists.txt @@ -0,0 +1,20 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Support + ) + +add_clang_unittest(TokenBufferTest + TokenBufferTest.cpp +) + +target_link_libraries(TokenBufferTest + PRIVATE + clangAST + clangBasic + clangFrontend + clangLex + clangSerialization + clangTooling + clangToolingSyntax + LLVMTestingSupport + ) Index: clang/unittests/Tooling/CMakeLists.txt =================================================================== --- clang/unittests/Tooling/CMakeLists.txt +++ clang/unittests/Tooling/CMakeLists.txt @@ -67,3 +67,6 @@ clangToolingInclusions clangToolingRefactor ) + + +add_subdirectory(Syntax) Index: clang/lib/Tooling/Syntax/TokenBuffer.cpp =================================================================== --- /dev/null +++ clang/lib/Tooling/Syntax/TokenBuffer.cpp @@ -0,0 +1,387 @@ +//===- TokenBuffer.cpp - store tokens of preprocessed files ---*- C++ -*-=====// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "clang/Tooling/Syntax/TokenBuffer.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.def" +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" +#include "llvm/Support/FormatVariadic.h" +#include <iterator> + +using namespace clang; +using namespace clang::syntax; + +syntax::Token::Token(const clang::Token &T) + : Token(T.getLocation(), T.getLength(), T.getKind()) { + assert(!T.isAnnotation()); +} +llvm::StringRef syntax::Token::text(const SourceManager &SM) const { + bool Invalid = false; + const char *Start = SM.getCharacterData(location(), &Invalid); + assert(!Invalid); + return llvm::StringRef(Start, length()); +} + +TokenBuffer syntax::tokenize(FileID FID, const SourceManager &SM, + const LangOptions &LO) { + std::vector<syntax::Token> Tokens; + IdentifierTable Identifiers(LO); + auto AddToken = [&](clang::Token T) { + if (T.getKind() == tok::raw_identifier && !T.needsCleaning() && + !T.hasUCN()) { // FIXME: support needsCleaning and hasUCN cases. 
+ clang::IdentifierInfo &II = Identifiers.get(T.getRawIdentifier()); + T.setIdentifierInfo(&II); + T.setKind(II.getTokenID()); + } + Tokens.push_back(syntax::Token(T)); + }; + + Lexer L(FID, SM.getBuffer(FID), SM, LO); + + clang::Token T; + while (!L.LexFromRawLexer(T)) + AddToken(T); + AddToken(T); + + return TokenBuffer(std::move(Tokens)); +} + +class TokenCollector::Callbacks : public PPCallbacks { +public: + Callbacks(const SourceManager &SM, const LangOptions &LO, TokenBuffer &Result) + : Result(Result), SM(SM), LO(LO) {} + + void FileChanged(SourceLocation Loc, FileChangeReason Reason, + SrcMgr::CharacteristicKind FileType, + FileID PrevFID) override { + assert(Loc.isFileID()); + InsideMainFile = SM.getFileID(Loc) == SM.getMainFileID(); + flushCurrentExpansion(); + } + + void MacroDefined(const clang::Token &MacroNameTok, + const MacroDirective *MD) override { + flushCurrentExpansion(); + handleMacroDirective(MacroNameTok.getLocation(), /*AnchorDiff=*/2); + } + + void MacroUndefined(const clang::Token &MacroNameTok, + const MacroDefinition &MD, + const MacroDirective *Undef) override { + flushCurrentExpansion(); + handleMacroDirective(MacroNameTok.getLocation(), /*AnchorDiff=*/2); + } + + void InclusionDirective(SourceLocation HashLoc, + const clang::Token &IncludeTok, StringRef FileName, + bool IsAngled, CharSourceRange FilenameRange, + const FileEntry *File, StringRef SearchPath, + StringRef RelativePath, const Module *Imported, + SrcMgr::CharacteristicKind FileType) override { + flushCurrentExpansion(); + handleMacroDirective(IncludeTok.getLocation(), /*AnchorDiff=*/1); + } + + void If(SourceLocation Loc, SourceRange ConditionRange, + ConditionValueKind ConditionValue) override { + flushCurrentExpansion(); + handleMacroDirective(Loc, /*AnchorDiff=*/1); + } + + void Elif(SourceLocation Loc, SourceRange ConditionRange, + ConditionValueKind ConditionValue, SourceLocation IfLoc) override { + flushCurrentExpansion(); + handleMacroDirective(Loc, 
/*AnchorDiff=*/1); + } + + void Ifdef(SourceLocation Loc, const clang::Token &MacroNameTok, + const MacroDefinition &MD) override { + flushCurrentExpansion(); + handleMacroDirective(Loc, /*AnchorDiff=*/1); + } + + void Ifndef(SourceLocation Loc, const clang::Token &MacroNameTok, + const MacroDefinition &MD) override { + flushCurrentExpansion(); + handleMacroDirective(Loc, /*AnchorDiff=*/1); + } + + void Else(SourceLocation Loc, SourceLocation IfLoc) override { + flushCurrentExpansion(Loc); + handleMacroDirective(Loc, /*AnchorDiff=*/1); + } + + void Endif(SourceLocation Loc, SourceLocation IfLoc) override { + flushCurrentExpansion(); + handleMacroDirective(Loc, /*AnchorDiff=*/1); + } + + // FIXME: missing moduleImport(), Ident(), ... + + void PragmaDirective(SourceLocation Loc, + PragmaIntroducerKind Introducer) override { + if (!InsideMainFile) + return; + assert(PragmaStart.isInvalid() && "Recursive #pragma directives?"); + PragmaStart = Loc; + } + + void tokenLexed(const clang::Token &T) { + if (!InsideMainFile) + return; + auto L = T.getLocation(); + assert(L.isValid()); + + // Parser sometimes goes through the same tokens again, we are only + // interested in the initial iteration. + if (!Result.Tokens.empty() && + !SM.isBeforeInTranslationUnit(Result.Tokens.back().location(), L)) + return; + flushCurrentExpansion(L); + + if (ExpansionStart.isValid() && SM.getExpansionLoc(L) != ExpansionStart) { + // There are intermediate macro argument expansions. Skip them, they will + // be reported again later. + return; + } + + DEBUG_WITH_TYPE("collect-tokens", + llvm::dbgs() << llvm::formatv( + "$[token], name - {0}, length - {1}, spelling - {2}\n", + tok::getTokenName(T.getKind()), T.getLength(), + Lexer::getSpelling(T, SM, LO))); + Result.Tokens.push_back(syntax::Token(T)); + assert(Result.Tokens.back().location().isValid()); + + // Process the end of #pragma directive. 
+ if (PragmaStart.isValid() && T.getKind() == tok::eod) { + handleMacroDirective(PragmaStart, /*AnchorDiff=*/0); + PragmaStart = SourceLocation(); + return; + } + } + + void MacroExpands(const clang::Token &MacroNameTok, const MacroDefinition &MD, + SourceRange Range, const MacroArgs *Args) override { + if (!InsideMainFile) + return; + + auto MacroNameLoc = MacroNameTok.getLocation(); + flushCurrentExpansion(MacroNameLoc); + + // Note that MacroNameTok was not reported yet. + auto ExpansionStart = + std::find_if(Result.Tokens.rbegin(), Result.Tokens.rend(), + [&](const syntax::Token &T) { + return SM.isBeforeInTranslationUnit(T.location(), + MacroNameLoc); + }) + .base(); + if (ExpansionFile.isValid()) { + // This is a recursive macro expansion, so we do not need to record it. + DEBUG_WITH_TYPE("collect-tokens", + llvm::dbgs() << llvm::formatv( + "$[macro-expands] dropping {0} macro tokens\n", + std::distance(ExpansionStart, Result.Tokens.end()))); + Result.Tokens.erase(ExpansionStart, Result.Tokens.end()); + return; + } + // This is a new top-level macro expansion, record it. + MacroExpansion MC; + MC.BeginFileToken = Result.MacroTokens.size(); + MC.EndFileToken = + MC.BeginFileToken + (Result.Tokens.end() - ExpansionStart) + 1; + // Store the macro name and macro arguments, they are used when calculating + // textual. + Result.MacroTokens.push_back(syntax::Token(MacroNameTok)); + for (auto &T : llvm::make_range(ExpansionStart, Result.Tokens.end())) + Result.MacroTokens.push_back(T); + // Macro call tokens are not part of the expanded tokens, so remove them. + DEBUG_WITH_TYPE("collect-tokens", + llvm::dbgs() << llvm::formatv( + "$[macro-expands] dropping {0} macro tokens\n", + std::distance(ExpansionStart, Result.Tokens.end()))); + Result.Tokens.erase(ExpansionStart, Result.Tokens.end()); + + MC.BeginExpansionToken = Result.Tokens.size(); + // MC.EndExpansionToken is filled after the expansion finishes. 
+ Result.Expansions.push_back(MC); + // We need to record where expansion ends in order to track it properly. + std::tie(ExpansionFile, ExpansionEndOffset) = + SM.getDecomposedLoc(Range.getEnd()); + this->ExpansionStart = Range.getBegin(); + } + +private: + void handleMacroDirective(SourceLocation Anchor, int AnchorOffset) { + if (!InsideMainFile) + return; + + flushCurrentExpansion(Anchor); + + assert(!Result.Tokens.empty()); + assert(Result.Tokens.back().kind() == tok::eod); + auto MacroStart = std::find_if(Result.Tokens.rbegin(), Result.Tokens.rend(), + [&](const syntax::Token &T) { + return T.location() == Anchor; + }) + .base(); + // MacroStart now points a few tokens after the start of the macro, e.g. + // # define MACRO ^... + // # include ^... + // we want to move it to point to 'define' or 'include', respectively. + // # ^define MACRO ... + // # ^include ... + assert(std::distance(Result.Tokens.begin(), MacroStart) >= AnchorOffset); + std::advance(MacroStart, -AnchorOffset); + + DEBUG_WITH_TYPE("collect-tokens", + llvm::dbgs() << llvm::formatv( + "$[pp-directive] dropping {0} macro directive tokens\n", + std::distance(MacroStart, Result.Tokens.end()))); + Result.Tokens.erase(MacroStart, Result.Tokens.end()); + } + +private: + void flushCurrentExpansion() { + if (!ExpansionFile.isValid()) + return; + assert(!Result.Expansions.empty()); + assert(Result.Expansions.back().EndExpansionToken == 0); + Result.Expansions.back().EndExpansionToken = Result.Tokens.size(); + + ExpansionFile = FileID(); + ExpansionStart = SourceLocation(); + ExpansionEndOffset = 0; + } + + void flushCurrentExpansion(SourceLocation L) { + assert(L.isValid()); + if (!ExpansionFile.isValid()) + return; + FileID File; + unsigned Offset; + std::tie(File, Offset) = SM.getDecomposedLoc(L); + if (File != ExpansionFile || Offset <= ExpansionEndOffset) + return; + // Check we are not inside the current macro arguments. 
+ flushCurrentExpansion(); + } + + bool InsideMainFile = false; + // The start location of the currently processed #pragma directive. + SourceLocation PragmaStart; + /// When valid, the range of the last active top-level macro expansion. + FileID ExpansionFile; + SourceLocation ExpansionStart; + unsigned ExpansionEndOffset = 0; + TokenBuffer &Result; + const SourceManager &SM; + const LangOptions &LO; +}; + +llvm::ArrayRef<syntax::Token> +MacroExpansion::tokens(const TokenBuffer &B) const { + return B.tokens().slice(BeginExpansionToken, + EndExpansionToken - BeginExpansionToken); +} + +llvm::ArrayRef<syntax::Token> +MacroExpansion::macroTokens(const TokenBuffer &B) const { + return B.macroTokens().slice(BeginFileToken, EndFileToken - BeginFileToken); +} + +std::pair<unsigned, unsigned> +MacroExpansion::macroRange(const TokenBuffer &B, + const SourceManager &SM) const { + auto M = macroTokens(B); + return {SM.getFileOffset(M.front().location()), + SM.getFileOffset(M.back().endLocation())}; +} + +TokenBuffer::TokenBuffer(std::vector<syntax::Token> Tokens) + : Tokens(std::move(Tokens)) { +#ifndef NDEBUG + for (const auto &T : this->Tokens) + assert(T.location().isFileID()); +#endif +} + +TokenCollector::TokenCollector(Preprocessor &PP) { + auto CBOwner = llvm::make_unique<Callbacks>(PP.getSourceManager(), + PP.getLangOpts(), Tokens); + auto *CB = CBOwner.get(); + + PP.addPPCallbacks(std::move(CBOwner)); + PP.setTokenWatcher([CB](const clang::Token &T) { CB->tokenLexed(T); }); +} + +TokenBuffer TokenCollector::consume() && { return std::move(Tokens); } + +llvm::Optional<std::pair<unsigned, unsigned>> +TokenBuffer::toOffsetRange(const Token *Begin, const Token *End, + const SourceManager &SM) const { + assert(Begin < End); + unsigned BeginIndex = Begin - Tokens.data(); + unsigned EndIndex = End - Tokens.data(); + + // Find the first macro call that intersects with our range. 
+ auto FirstCall = + std::upper_bound(Expansions.begin(), Expansions.end(), BeginIndex, + [](unsigned L, const MacroExpansion &R) { + return L < R.BeginExpansionToken; + }); + if (FirstCall != Expansions.begin()) { + --FirstCall; + if (FirstCall->EndExpansionToken <= BeginIndex) + FirstCall = Expansions.end(); + } else { + FirstCall = Expansions.end(); + } + // Find the last macro call that intersects with our range. + auto LastCall = + std::lower_bound(Expansions.begin(), Expansions.end(), EndIndex, + [](const MacroExpansion &L, unsigned R) { + return L.EndExpansionToken < R; + }); + if (LastCall != Expansions.end() && EndIndex <= LastCall->BeginExpansionToken) + LastCall = Expansions.end(); + // Only allow changes that involve the whole macro calls, disallow anything + // that changes macros in between. + // FIXME: also allow changes uniquely mapping to macro arguments. + assert(FirstCall == Expansions.end() || LastCall == Expansions.end() || + FirstCall <= LastCall); + + // Check the first macro call is fully-covered. + if (FirstCall != Expansions.end() && + (FirstCall->BeginExpansionToken < BeginIndex || + EndIndex < FirstCall->EndExpansionToken)) { + return llvm::None; + } + // Check the last macro call is fully-covered. + if (LastCall != Expansions.end() && + (LastCall->BeginExpansionToken < BeginIndex || + EndIndex < LastCall->EndExpansionToken)) { + return llvm::None; + } + + unsigned BeginOffset = + SM.getFileOffset(FirstCall != Expansions.end() + ? FirstCall->macroTokens(*this).front().location() + : Begin->location()); + unsigned EndOffset = + SM.getFileOffset(LastCall != Expansions.end() + ? 
LastCall->macroTokens(*this).back().endLocation() + : std::prev(End)->endLocation()); + return std::make_pair(BeginOffset, EndOffset); +} Index: clang/lib/Tooling/Syntax/CMakeLists.txt =================================================================== --- /dev/null +++ clang/lib/Tooling/Syntax/CMakeLists.txt @@ -0,0 +1,10 @@ +set(LLVM_LINK_COMPONENTS Support) + +add_clang_library(clangToolingSyntax + TokenBuffer.cpp + + LINK_LIBS + clangBasic + clangFrontend + clangLex + ) Index: clang/lib/Tooling/CMakeLists.txt =================================================================== --- clang/lib/Tooling/CMakeLists.txt +++ clang/lib/Tooling/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(Inclusions) add_subdirectory(Refactoring) add_subdirectory(ASTDiff) +add_subdirectory(Syntax) add_clang_library(clangTooling AllTUsExecution.cpp Index: clang/include/clang/Tooling/Syntax/TokenBuffer.h =================================================================== --- /dev/null +++ clang/include/clang/Tooling/Syntax/TokenBuffer.h @@ -0,0 +1,185 @@ +//===- TokenBuffer.h - store tokens of preprocessed files -----*- C++ -*-=====// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_BUFFER_H +#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_BUFFER_H + +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.def" +#include "clang/Basic/TokenKinds.h" +#include "clang/Lex/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include <cstdint> + +namespace clang { +class Preprocessor; + +namespace syntax { +class TokenBuffer; + +/// A token coming directly from a file or a macro expansion. 
Has just enough +/// information to locate the token in the source code. +class Token { +public: + Token() = default; + Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind) + : Location(Location), Length(Length), Kind(Kind) {} + /// EXPECTS: clang::Token is not an annotation token. + explicit Token(const clang::Token &T); + + tok::TokenKind kind() const { return Kind; } + SourceLocation location() const { return Location; } + SourceLocation endLocation() const { + return Location.getLocWithOffset(Length); + } + unsigned length() const { return Length; } + + /// Get the substring covered by the token. Note that it will include all + /// digraphs, newline continuations, etc. E.g. 'int' and + /// in\ + /// t + /// both have the same kind tok::kw_int, but results of text() are different. + llvm::StringRef text(const SourceManager &SM) const; + +private: + SourceLocation Location; + unsigned Length = 0; + tok::TokenKind Kind = tok::NUM_TOKENS; +}; + +static_assert(sizeof(Token) <= 16, "Token is unresonably large"); + +/// A top-level macro expansion inside a file. +class MacroExpansion { +public: + /// The tokens obtained after expansion. + llvm::ArrayRef<syntax::Token> tokens(const TokenBuffer &B) const; + /// These cover the name and arguments of a macro (if any), including the + /// parentheses around macro arguments. + llvm::ArrayRef<syntax::Token> macroTokens(const TokenBuffer &B) const; + /// Range of offsets covering the name of a macro expansion or the name and + /// arguments of a functional macro invocation. 
+ std::pair<unsigned, unsigned> macroRange(const TokenBuffer &B, + const SourceManager &SM) const; + +private: + friend class TokenCollector; + friend class TokenBuffer; + unsigned BeginExpansionToken = 0; + unsigned EndExpansionToken = 0; + unsigned BeginFileToken = 0; + unsigned EndFileToken = 0; +}; + +/// A list of tokens obtained by lexing and preprocessing a text buffer and a +/// set of helpers to allow mapping the tokens after preprocessing to the +/// corresponding code written in a file. TokenBuffer has information about two +/// token streams: +/// 1. tokens produced by the preprocessor, i.e. after all macro expansions, +/// 2. pre-expansion tokens that correspond to the source code of a file. +/// The tokens for (1) are stored directly and can be accessed with the tokens() +/// method. However, some of these tokens may come from macro expansions and so +/// they don't correspond directly to any text in a file, e.g. +/// +/// #define FOO 10 +/// int a = FOO; // no token '10' in the file, just 'FOO' +/// +/// For these tokens, TokenBuffer allows obtaining the macro name and macro +/// arguments that were used to produce the expansion with the 'toOffsetRange()' +/// method. +/// There are two ways to build a TokenBuffer: +/// 1. If you are running a clang frontend invocation, use the TokenCollector +/// class, +/// 2. if you only need to lex a file, use the tokenize() helper. +class TokenBuffer { +public: + TokenBuffer() = default; + // Assumes no macro expansions have taken place. + TokenBuffer(std::vector<syntax::Token> Tokens); + + /// All tokens from the result of preprocessor expansion, i.e. the list of + /// tokens produced by the preprocessor. Source locations in the clang AST + /// should always point into any of these nodes. + llvm::ArrayRef<syntax::Token> tokens() const { return Tokens; } + /// Attempt to map a range of expanded tokens into a continuous substring of + /// the original source file. 
The transformation may not be possible if the + /// range requires changing the macro expansions. + llvm::Optional<std::pair<unsigned, unsigned>> + toOffsetRange(const Token *Begin, const Token *End, + const SourceManager &SM) const; + + /// All top-level macro expansions from the corresponding file. Includes + /// functional macro invocations and expansions of macro identifiers. E.g. it would + /// contain 3 entries for the following code: + /// #define FOO 2*5 + /// #define BAR(a,b) a+b+FOO + /// BAR(FOO, FOO) // #1 + /// int a = FOO; // #2 + /// int b = BAR(a, BAR(6, FOO)); // #3 + /// Note that neither expansions inside macro arguments (e.g. 'FOO' in + /// 'BAR(FOO, FOO)') nor recursive macro expansions are present in the + /// result. + llvm::ArrayRef<MacroExpansion> expansions() const { return Expansions; } + /// Tokens of macro directives and top-level macro expansions. These are not + /// part of the expanded token stream, but they fill the gaps for the file. + /// Here is an example: + /// #define DECL(name) int name = 10 + /// DECL(a); + /// For the input above, we would get tokens() = {"int", "a", "=", "10", ";"} + /// and macroTokens() = {"DECL", "(", "a", ")"}. + llvm::ArrayRef<syntax::Token> macroTokens() const { return MacroTokens; } + +private: + friend class TokenCollector; + friend class MacroExpansion; + /// Expanded tokens, the ASTs are built on top of these. Some of the tokens + /// have file locations and can be used to obtain the file offsets directly. + std::vector<syntax::Token> Tokens; + /// Tokens forming top-level macro expansions, i.e. all macro names and macro + /// arguments. + std::vector<syntax::Token> MacroTokens; + /// A list of top-level macro expansions inside a particular file. + std::vector<MacroExpansion> Expansions; +}; + +/// Lex the text buffer, corresponding to \p FID, in raw mode and record the +/// resulting tokens. Does minimal post-processing on raw identifiers, setting +/// their corresponding token kind. 
This is a very low-level function, most +/// users should prefer to use TokenCollector. Lexing in raw mode produces +/// wildly different results from what one might expect when running a C++ +/// frontend, e.g. preprocessor does not run at all. +TokenBuffer tokenize(FileID FID, const SourceManager &SM, + const LangOptions &LO); + +/// Collects tokens for the main file while running the frontend action. An +/// instance of this object should be created on +/// FrontendAction::BeginSourceFile() and the results should be consumed after +/// FrontendAction::Execute() finishes. +class TokenCollector { +public: + /// Adds the hooks to collect the tokens. Should be called before the + /// preprocessing starts, i.e. as a part of BeginSourceFile() or + /// CreateASTConsumer(). + TokenCollector(Preprocessor &P); + + /// Consumes the result. Should be called after preprocessing is finished, + /// i.e. after running Execute(). + TokenBuffer consume() &&; + +private: + class Callbacks; + TokenBuffer Tokens; +}; + +} // namespace syntax +} // namespace clang + +#endif
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits