[PATCH] D59887: [Syntax] Introduce TokenBuffer, start clangToolingSyntax library

Ilya Biryukov via Phabricator via cfe-commits Fri, 05 Apr 2019 09:36:53 -0700

ilya-biryukov updated this revision to Diff 193903.
ilya-biryukov marked 2 inline comments as done.
ilya-biryukov added a comment.
Herald added a subscriber: mgrang.


Changes:

- Add multi-file support, record a single expanded stream and per-file-id raw 
token streams and mappings.
- Rename MacroInvocation to TokenBuffer::Mapping, make it private.
- Simplify TokenCollector, let preprocessor handle some more stuff.

TODO:

- update the docs
- go through other comments again
- write more tests


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D59887/new/

https://reviews.llvm.org/D59887

Files:
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/lib/Tooling/CMakeLists.txt
  clang/lib/Tooling/Syntax/CMakeLists.txt
  clang/lib/Tooling/Syntax/Tokens.cpp
  clang/unittests/Tooling/CMakeLists.txt
  clang/unittests/Tooling/Syntax/CMakeLists.txt
  clang/unittests/Tooling/Syntax/TokensTest.cpp

Index: clang/unittests/Tooling/Syntax/TokensTest.cpp
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/TokensTest.cpp
@@ -0,0 +1,602 @@
+//===- TokensTest.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Tokens.h"
+#include "clang/AST/ASTConsumer.h"
+#include "clang/AST/Expr.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticIDs.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/FileSystemOptions.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.def"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/FrontendAction.h"
+#include "clang/Frontend/Utils.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/Token.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Testing/Support/Annotations.h"
+#include "gmock/gmock-more-matchers.h"
+#include <cassert>
+#include <cstdlib>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <memory>
+#include <ostream>
+#include <string>
+
+using namespace clang;
+using namespace clang::syntax;
+
+using ::testing::AllOf;
+using ::testing::Contains;
+using ::testing::ElementsAre;
+using ::testing::Matcher;
+using ::testing::Pointwise;
+
+namespace {
+// Matchers for syntax::Token. SourceMgr is passed as a pointer.
+MATCHER_P(Kind, K, "") { return arg.kind() == K; }
+MATCHER_P2(HasText, Text, SourceMgr, "") {
+  return arg.text(*SourceMgr) == Text;
+}
+MATCHER_P2(IsIdent, Text, SourceMgr, "") {
+  return arg.kind() == tok::identifier && arg.text(*SourceMgr) == Text;
+}
+/// Checks a token starts at SourceRng.first and ends at SourceRng.second.
+MATCHER_P(RangeIs, SourceRng, "") {
+  return arg.location() == SourceRng.first &&
+         arg.endLocation() == SourceRng.second;
+}
+/// Checks the passed tuple holds two similar tokens, i.e. both map to the same
+/// kind equivalence class and are spelled with the same text.
+/// Ignores differences in kind between the raw and non-raw mode.
+MATCHER_P(IsSameToken, SourceMgr, "") {
+  auto ToEquivalenceClass = [](tok::TokenKind Kind) {
+    if (Kind == tok::identifier || Kind == tok::raw_identifier ||
+        tok::getKeywordSpelling(Kind) != nullptr)
+      return tok::identifier;
+    if (Kind == tok::string_literal || Kind == tok::header_name)
+      return tok::string_literal;
+    return Kind;
+  };
+
+  auto &L = std::get<0>(arg);
+  auto &R = std::get<1>(arg);
+  if (ToEquivalenceClass(L.kind()) != ToEquivalenceClass(R.kind()))
+    return false;
+  return L.text(*SourceMgr) == R.text(*SourceMgr);
+}
+} // namespace
+
+// Actual test fixture lives in the syntax namespace as it's a friend of
+// TokenBuffer.
+class syntax::TokensTest : public ::testing::Test {
+public:
+  /// Run the clang frontend, collect the preprocessed tokens from the frontend
+  /// invocation and store them in this->Buffer.
+  /// This also clears SourceManager before running the compiler.
+  void recordTokens(llvm::StringRef Code) {
+    class RecordTokens : public ASTFrontendAction {
+    public:
+      explicit RecordTokens(TokenBuffer &Result) : Result(Result) {}
+
+      bool BeginSourceFileAction(CompilerInstance &CI) override {
+        assert(!Collector && "expected only a single call to BeginSourceFile");
+        Collector.emplace(CI.getPreprocessor());
+        return true;
+      }
+      void EndSourceFileAction() override {
+        assert(Collector && "BeginSourceFileAction was never called");
+        Result = std::move(*Collector).consume();
+      }
+
+      std::unique_ptr<ASTConsumer>
+      CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override {
+        return llvm::make_unique<ASTConsumer>();
+      }
+
+    private:
+      TokenBuffer &Result;
+      llvm::Optional<TokenCollector> Collector;
+    };
+
+    constexpr const char *FileName = "./input.cpp";
+    FS->addFile(FileName, time_t(), llvm::MemoryBuffer::getMemBufferCopy(""));
+    // Prepare to run a compiler.
+    std::vector<const char *> Args = {"tok-test", "-std=c++03", "-fsyntax-only",
+                                      FileName};
+    auto CI = createInvocationFromCommandLine(Args, Diags, FS);
+    assert(CI);
+    CI->getFrontendOpts().DisableFree = false;
+    CI->getPreprocessorOpts().addRemappedFile(
+        FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release());
+    LangOpts = *CI->getLangOpts();
+    CompilerInstance Compiler;
+    Compiler.setInvocation(std::move(CI));
+    if (!Diags->getClient())
+      Diags->setClient(new IgnoringDiagConsumer);
+    Compiler.setDiagnostics(Diags.get());
+    Compiler.setFileManager(FileMgr.get());
+    Compiler.setSourceManager(SourceMgr.get());
+
+    this->Buffer = TokenBuffer();
+    RecordTokens Recorder(this->Buffer);
+    ASSERT_TRUE(Compiler.ExecuteAction(Recorder))
+        << "failed to run the frontend";
+
+    DEBUG_WITH_TYPE("syntax-tokens-test", {
+      llvm::dbgs() << "=== Recorded token stream:\n";
+      this->Buffer.dump(llvm::dbgs(), *SourceMgr);
+    });
+  }
+
+  /// Run syntax::tokenize() and return the results.
+  std::vector<syntax::Token> tokenize(llvm::StringRef Text) {
+    // Null-terminate so that we always see 'tok::eof' at the end.
+    std::string NullTerminated = Text.str();
+    auto FID = SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(
+        StringRef(NullTerminated.data(), NullTerminated.size() + 1)));
+    return syntax::tokenize(FID, *SourceMgr, LangOpts);
+  }
+
+  /// Checks that lexing \p ExpectedText in raw mode would produce the same
+  /// token stream as the one stored in this->Buffer.expandedTokens().
+  void expectTokens(llvm::StringRef ExpectedText) {
+    std::vector<syntax::Token> ExpectedTokens = tokenize(ExpectedText);
+    EXPECT_THAT(std::vector<syntax::Token>(Buffer.expandedTokens()),
+                Pointwise(IsSameToken(), ExpectedTokens))
+        << "\texpected tokens: " << ExpectedText;
+  }
+
+  void expectSameTokens(llvm::ArrayRef<syntax::Token> Actual,
+                        llvm::ArrayRef<syntax::Token> Expected) {
+    EXPECT_THAT(std::vector<syntax::Token>(Actual),
+                Pointwise(IsSameToken(), std::vector<syntax::Token>(Expected)));
+  }
+
+  struct ExpectedInvocation {
+    ExpectedInvocation(
+        std::string From, std::string To,
+        llvm::Optional<llvm::Annotations::Range> Range = llvm::None)
+        : From(std::move(From)), To(std::move(To)), Range(Range) {}
+    /// A textual representation of the macro tokens.
+    std::string From;
+    /// A textual representation of the tokens after macro replacement.
+    std::string To;
+    /// The text range of the macro invocation in the source code, if checked.
+    llvm::Optional<llvm::Annotations::Range> Range;
+  };
+
+  // FIXME: use a vocabulary range type instead.
+  std::pair<unsigned, unsigned>
+  mappingTextRange(const TokenBuffer::Mapping &M,
+                   const TokenBuffer::MarkedFile &F) {
+    assert(M.BeginRawToken < M.EndRawToken && "Invalid mapping");
+    return {
+        SourceMgr->getFileOffset(F.RawTokens.at(M.BeginRawToken).location()),
+        SourceMgr->getFileOffset(
+            F.RawTokens.at(M.EndRawToken - 1).endLocation())};
+  }
+
+  FileID findFile(llvm::StringRef Name) const {
+    const FileEntry *Entry = FileMgr->getFile(Name); // null if not found
+    FileID Found = Entry ? SourceMgr->translateFile(Entry) : FileID();
+    if (!Found.isValid()) {
+      ADD_FAILURE() << "SourceManager does not track " << Name;
+      std::abort();
+    }
+    return Found;
+  }
+  /// Checks the mappings recorded in this->Buffer for \p FID (the main file by
+  /// default) match the \p Expected ones; bails out if the file is untracked.
+  void expectMacroInvocations(llvm::ArrayRef<ExpectedInvocation> Expected,
+                              FileID FID = FileID()) {
+    if (!FID.isValid())
+      FID = SourceMgr->getMainFileID();
+    ASSERT_TRUE(Buffer.Files.count(FID)) << "tokens for file were not recorded";
+    TokenBuffer::MarkedFile &File = Buffer.Files[FID];
+
+    llvm::ArrayRef<TokenBuffer::Mapping> Actual = File.Mappings;
+    ASSERT_EQ(Actual.size(), Expected.size());
+
+    for (unsigned I = 0; I < Actual.size(); ++I) {
+      const auto &A = Actual[I];
+      const auto &E = Expected[I];
+
+      if (E.Range)
+        ASSERT_EQ(mappingTextRange(A, File),
+                  (std::pair<unsigned, unsigned>(E.Range->Begin, E.Range->End)))
+            << "\trange does not match";
+
+      auto DropEOF = [](std::vector<syntax::Token> Tokens) {
+        if (Tokens.empty() || Tokens.back().kind() != tok::eof) {
+          ADD_FAILURE() << "expected 'eof' at the end of the tokens";
+          return Tokens;
+        }
+        Tokens.pop_back();
+        return Tokens;
+      };
+
+      std::vector<syntax::Token> ActualRaw(
+          File.RawTokens.begin() + A.BeginRawToken,
+          File.RawTokens.begin() + A.EndRawToken);
+      ASSERT_THAT(ActualRaw,
+                  Pointwise(IsSameToken(), DropEOF(tokenize(E.From))))
+          << "\tmacro tokens do not match, expected " << E.From;
+
+      std::vector<syntax::Token> ActualExpanded(
+          Buffer.ExpandedTokens.begin() + A.BeginExpandedToken,
+          Buffer.ExpandedTokens.begin() + A.EndExpandedToken);
+      ASSERT_THAT(ActualExpanded,
+                  Pointwise(IsSameToken(), DropEOF(tokenize(E.To))))
+          << "\ttokens after macro replacements do not match, expected "
+          << E.To;
+    }
+  }
+
+  // Specialized versions of matchers that rely on SourceManager.
+  Matcher<syntax::Token> IsIdent(std::string Text) const {
+    return ::IsIdent(Text, SourceMgr.get());
+  }
+  Matcher<syntax::Token> HasText(std::string Text) const {
+    return ::HasText(Text, SourceMgr.get());
+  }
+  Matcher<syntax::Token> RangeIs(llvm::Annotations::Range R) const {
+    std::pair<SourceLocation, SourceLocation> Ls;
+    Ls.first = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID())
+                   .getLocWithOffset(R.Begin);
+    Ls.second = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID())
+                    .getLocWithOffset(R.End);
+    return ::RangeIs(Ls);
+  }
+
+  Matcher<std::tuple<const syntax::Token &, const syntax::Token &>>
+  IsSameToken() const {
+    return ::IsSameToken(SourceMgr.get());
+  }
+
+  void addFile(llvm::StringRef Path, llvm::StringRef Contents) {
+    if (!FS->addFile(Path, time_t(),
+                     llvm::MemoryBuffer::getMemBufferCopy(Contents))) {
+      ADD_FAILURE() << "could not add a file to VFS: " << Path;
+    }
+  }
+
+  // Data fields.
+  llvm::IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
+      new DiagnosticsEngine(new DiagnosticIDs, new DiagnosticOptions);
+  IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS =
+      new llvm::vfs::InMemoryFileSystem;
+  llvm::IntrusiveRefCntPtr<FileManager> FileMgr =
+      new FileManager(FileSystemOptions(), FS);
+  llvm::IntrusiveRefCntPtr<SourceManager> SourceMgr =
+      new SourceManager(*Diags, *FileMgr);
+  /// Contains last result of calling recordTokens().
+  TokenBuffer Buffer;
+  /// Contains options from last run of recordTokens().
+  LangOptions LangOpts;
+};
+
+namespace {
+TEST_F(TokensTest, RawMode) {
+  EXPECT_THAT(tokenize("int main() {}"),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // Comments are ignored for now.
+  EXPECT_THAT(tokenize("/* foo */int a; // more comments"),
+              ElementsAre(Kind(tok::kw_int), IsIdent("a"), Kind(tok::semi),
+                          Kind(tok::eof)));
+}
+
+TEST_F(TokensTest, Basic) {
+  recordTokens("int main() {}");
+  EXPECT_THAT(Buffer.expandedTokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // All kinds of whitespace are ignored.
+  recordTokens("\t\n  int\t\n  main\t\n  (\t\n  )\t\n{\t\n  }\t\n");
+  EXPECT_THAT(Buffer.expandedTokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+
+  llvm::Annotations Code(R"cpp(
+    $r1[[int]] $r2[[a]] $r3[[=]] $r4[["foo bar baz"]] $r5[[;]]
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_THAT(
+      Buffer.expandedTokens(),
+      ElementsAre(AllOf(Kind(tok::kw_int), RangeIs(Code.range("r1"))),
+                  AllOf(Kind(tok::identifier), RangeIs(Code.range("r2"))),
+                  AllOf(Kind(tok::equal), RangeIs(Code.range("r3"))),
+                  AllOf(Kind(tok::string_literal), RangeIs(Code.range("r4"))),
+                  AllOf(Kind(tok::semi), RangeIs(Code.range("r5"))),
+                  Kind(tok::eof)));
+}
+
+TEST_F(TokensTest, MacroDirectives) {
+  // Directives yield no expanded tokens or mappings, only raw tokens.
+  llvm::StringLiteral Code = R"cpp(
+    #define FOO a
+    #include "unresolved_file.h"
+    #undef FOO
+    #ifdef X
+    #else
+    #endif
+    #ifndef Y
+    #endif
+    #if 1
+    #elif 2
+    #else
+    #endif
+    #pragma once
+    #pragma something lalala
+
+    int a;
+  )cpp";
+  recordTokens(Code);
+
+  expectTokens("int a;");
+  expectMacroInvocations({});
+
+  expectSameTokens(Buffer.rawTokens(SourceMgr->getMainFileID()),
+                   tokenize(Code));
+}
+
+TEST_F(TokensTest, MacroReplacements) {
+  // A simple object-like macro.
+  llvm::Annotations Code(R"cpp(
+    #define INT int const
+    [[INT]] a;
+    )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int const a;");
+  expectMacroInvocations({{"INT", "int const", Code.range()}});
+
+  // A simple function-like macro.
+  Code = llvm::Annotations(R"cpp(
+    #define INT(a) const int
+    [[INT(10+10)]] a;
+    )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("const int a;");
+  expectMacroInvocations({{"INT(10+10)", "const int", Code.range()}});
+
+  // Recursive macro replacements.
+  Code = llvm::Annotations(R"cpp(
+    #define ID(X) X
+    #define INT int const
+    [[ID(ID(INT))]] a;
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int const a;");
+  expectMacroInvocations({{"ID(ID(INT))", "int const", Code.range()}});
+
+  // A little more complicated recursive macro replacements.
+  Code = llvm::Annotations(R"cpp(
+    #define ADD(X, Y) X+Y
+    #define MULT(X, Y) X*Y
+
+    int a = [[ADD(MULT(1,2), MULT(3,ADD(4,5)))]];
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int a = 1*2+3*4+5;");
+  expectMacroInvocations(
+      {{"ADD(MULT(1,2), MULT(3,ADD(4,5)))", "1*2+3*4+5", Code.range()}});
+
+  // Empty macro replacement.
+  Code = llvm::Annotations(R"cpp(
+    #define EMPTY
+    #define EMPTY_FUNC(X)
+    $m[[EMPTY]]
+    $f[[EMPTY_FUNC(1+2+3)]]
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("");
+  expectMacroInvocations({{"EMPTY", "", Code.range("m")},
+                          {"EMPTY_FUNC(1+2+3)", "", Code.range("f")}});
+}
+
+TEST_F(TokensTest, SpecialTokens) {
+  // Tokens coming from concatenations.
+  recordTokens(R"cpp(
+    #define CONCAT(a, b) a ## b
+    int a = CONCAT(1, 2);
+  )cpp");
+  expectTokens("int a = 12;");
+  // Multi-line tokens with slashes at the end.
+  recordTokens("i\\\nn\\\nt");
+  EXPECT_THAT(Buffer.expandedTokens(),
+              ElementsAre(AllOf(Kind(tok::kw_int), HasText("i\\\nn\\\nt")),
+                          Kind(tok::eof)));
+  // FIXME: test tokens with digraphs and UCN identifiers.
+}
+
+TEST_F(TokensTest, LateBoundTokens) {
+  // The parser eventually breaks the first '>>' into two tokens ('>' and '>'),
+  // but we choose to record them as a single token (for now).
+  llvm::Annotations Code(R"cpp(
+    template <class T>
+    struct foo { int a; };
+    int bar = foo<foo<int$br[[>>]]().a;
+    int baz = 10 $op[[>>]] 2;
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_THAT(std::vector<syntax::Token>(Buffer.expandedTokens()),
+              AllOf(Contains(AllOf(Kind(tok::greatergreater),
+                                   RangeIs(Code.range("br")))),
+                    Contains(AllOf(Kind(tok::greatergreater),
+                                   RangeIs(Code.range("op"))))));
+}
+
+TEST_F(TokensTest, DelayedParsing) {
+  llvm::StringLiteral Code = R"cpp(
+    struct Foo {
+      int method() {
+        // Parser will visit method bodies and initializers multiple times, but
+        // TokenBuffer should only record the first walk over the tokens;
+        return 100;
+      }
+      int a = 10;
+      int b = 20;
+
+      struct Subclass {
+        void foo() {
+          Foo().method();
+        }
+      };
+    };
+  )cpp";
+  recordTokens(Code);
+  // Checks that lexing in raw mode produces the same results, hence we're not
+  // recording any tokens twice and the order is the same.
+  expectTokens(Code);
+}
+
+TEST_F(TokensTest, Offsets) {
+  llvm::Annotations Code("");
+  /// Finds a token with the specified text.
+  auto Find = [this](llvm::StringRef Text) {
+    llvm::ArrayRef<syntax::Token> Tokens = Buffer.expandedTokens();
+    auto TokenMatches = [=](const syntax::Token &T) {
+      return T.text(*SourceMgr) == Text;
+    };
+    auto It = llvm::find_if(Tokens, TokenMatches);
+    if (It == Tokens.end()) {
+      ADD_FAILURE() << "could not find the token for " << Text;
+      std::abort();
+    }
+    if (std::find_if(std::next(It), Tokens.end(), TokenMatches) !=
+        Tokens.end()) {
+      ADD_FAILURE() << "token is not unique: " << Text;
+      std::abort();
+    };
+    return It;
+  };
+  auto Range = [&Code](llvm::StringRef Name) {
+    auto R = Code.range(Name);
+    return std::pair<unsigned, unsigned>(R.Begin, R.End);
+  };
+
+  Code = llvm::Annotations(R"cpp(
+    $all[[$first[[a1 a2 a3]] FIRST $second[[b1 b2]] LAST]]
+  )cpp");
+
+  recordTokens(Code.code());
+  EXPECT_EQ(
+      Buffer.toOffsetRange(Find("a1"), std::next(Find("LAST")), *SourceMgr),
+      Range("all"));
+  EXPECT_EQ(Buffer.toOffsetRange(Find("a1"), Find("FIRST"), *SourceMgr),
+            Range("first"));
+  EXPECT_EQ(Buffer.toOffsetRange(Find("b1"), Find("LAST"), *SourceMgr),
+            Range("second"));
+
+  Code = llvm::Annotations(R"cpp(
+    #define A a1 a2 a3
+    #define B b1 b2
+
+    $all[[$first[[A]] FIRST $second[[B]] LAST]]
+  )cpp");
+  recordTokens(Code.code());
+
+  EXPECT_EQ(
+      Buffer.toOffsetRange(Find("a1"), std::next(Find("LAST")), *SourceMgr),
+      Range("all"));
+  EXPECT_EQ(*Buffer.toOffsetRange(Find("a1"), Find("FIRST"), *SourceMgr),
+            Range("first"));
+  EXPECT_EQ(*Buffer.toOffsetRange(Find("b1"), Find("LAST"), *SourceMgr),
+            Range("second"));
+  // Ranges not fully covering macro invocations should fail.
+  EXPECT_EQ(Buffer.toOffsetRange(Find("a1"), Find("a3"), *SourceMgr),
+            llvm::None);
+  EXPECT_EQ(Buffer.toOffsetRange(Find("b2"), Find("LAST"), *SourceMgr),
+            llvm::None);
+  EXPECT_EQ(Buffer.toOffsetRange(Find("a2"), Find("b2"), *SourceMgr),
+            llvm::None);
+
+  Code = llvm::Annotations(R"cpp(
+    #define ID(x) x
+    #define B b1 b2
+
+    $both[[$first[[ID(ID(ID(a1) a2 a3))]] FIRST $second[[ID(B)]]]] LAST
+  )cpp");
+  recordTokens(Code.code());
+
+  EXPECT_EQ(Buffer.toOffsetRange(Find("a1"), Find("FIRST"), *SourceMgr),
+            Range("first"));
+  EXPECT_EQ(Buffer.toOffsetRange(Find("b1"), Find("LAST"), *SourceMgr),
+            Range("second"));
+  EXPECT_EQ(Buffer.toOffsetRange(Find("a1"), Find("LAST"), *SourceMgr),
+            Range("both"));
+
+  // Ranges crossing macro call boundaries.
+  EXPECT_EQ(Buffer.toOffsetRange(Find("a1"), Find("b2"), *SourceMgr),
+            llvm::None);
+  EXPECT_EQ(Buffer.toOffsetRange(Find("a2"), Find("b2"), *SourceMgr),
+            llvm::None);
+  // FIXME: next two examples should map to macro arguments, but currently they
+  //        fail.
+  EXPECT_EQ(Buffer.toOffsetRange(Find("a2"), Find("a3"), *SourceMgr),
+            llvm::None);
+  EXPECT_EQ(Buffer.toOffsetRange(Find("a1"), Find("a3"), *SourceMgr),
+            llvm::None);
+}
+
+TEST_F(TokensTest, SimpleMultiFile) {
+  addFile("./foo.h", R"cpp(
+    #define ADD(X, Y) X+Y
+    int a = 100;
+    #include "bar.h"
+  )cpp");
+  addFile("./bar.h", R"cpp(
+    int b = ADD(1, 2);
+    #define MULT(X, Y) X*Y
+  )cpp");
+  recordTokens(R"cpp(
+    #include "foo.h"
+    int c = ADD(1, MULT(2,3));
+  )cpp");
+
+  expectTokens(R"cpp(
+    int a = 100;
+    int b = 1+2;
+    int c = 1+2*3;
+  )cpp");
+  expectMacroInvocations({{"ADD(1,MULT(2,3))", "1+2*3"}});
+  expectMacroInvocations({}, findFile("./foo.h")); // foo.h invokes no macros
+  expectMacroInvocations({{"ADD(1,2)", "1+2"}}, findFile("./bar.h"));
+}
+} // namespace
Index: clang/unittests/Tooling/Syntax/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  Support
+  )
+
+add_clang_unittest(TokensTest
+  TokensTest.cpp
+)
+
+target_link_libraries(TokensTest
+  PRIVATE
+  clangAST
+  clangBasic
+  clangFrontend
+  clangLex
+  clangSerialization
+  clangTooling
+  clangToolingSyntax
+  LLVMTestingSupport
+  )
Index: clang/unittests/Tooling/CMakeLists.txt
===================================================================
--- clang/unittests/Tooling/CMakeLists.txt
+++ clang/unittests/Tooling/CMakeLists.txt
@@ -67,3 +67,6 @@
   clangToolingInclusions
   clangToolingRefactor
   )
+
+
+add_subdirectory(Syntax)
Index: clang/lib/Tooling/Syntax/Tokens.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Tokens.cpp
@@ -0,0 +1,362 @@
+//===- Tokens.cpp - store tokens of preprocessed files --------*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "clang/Tooling/Syntax/Tokens.h"
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.def"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <iterator>
+#include <vector>
+
+using namespace clang;
+using namespace clang::syntax;
+
+syntax::Token::Token(const clang::Token &T)
+    : Token(T.getLocation(), T.getLength(), T.getKind()) {
+  assert(!T.isAnnotation());
+}
+
+llvm::StringRef syntax::Token::text(const SourceManager &SM) const {
+  bool Invalid = false;
+  const char *Start = SM.getCharacterData(location(), &Invalid);
+  assert(!Invalid);
+  return llvm::StringRef(Start, length());
+}
+
+std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
+                                            const LangOptions &LO) {
+  std::vector<syntax::Token> Result;
+  IdentifierTable Identifiers(LO);
+  auto Record = [&](clang::Token Tok) {
+    // FIXME: support needsCleaning and hasUCN cases.
+    if (Tok.is(tok::raw_identifier) && !Tok.needsCleaning() && !Tok.hasUCN()) {
+      clang::IdentifierInfo &Info = Identifiers.get(Tok.getRawIdentifier());
+      Tok.setIdentifierInfo(&Info);
+      Tok.setKind(Info.getTokenID());
+    }
+    Result.push_back(syntax::Token(Tok));
+  };
+
+  Lexer RawLexer(FID, SM.getBuffer(FID), SM, LO);
+  clang::Token Tok;
+  // The token that terminates the loop is recorded as well.
+  while (!RawLexer.LexFromRawLexer(Tok))
+    Record(Tok);
+  Record(Tok);
+
+  return Result;
+}
+
+class TokenCollector::Callbacks : public PPCallbacks {
+public:
+  Callbacks(const SourceManager &SM, TokenBuffer &Result)
+      : Result(Result), SM(SM) {}
+
+  void FileChanged(SourceLocation Loc, FileChangeReason Reason,
+                   SrcMgr::CharacteristicKind FileType,
+                   FileID PrevFID) override {
+    assert(Loc.isFileID());
+    flushMacroExpansion(); // must run while File is still the previous file.
+    File = &Result.Files.try_emplace(SM.getFileID(Loc)).first->second;
+  }
+
+  void tokenLexed(const clang::Token &T, TokenSource S) {
+    DEBUG_WITH_TYPE("tokens-test", {
+      // FIXME: this is probably too detailed and too much code.
+      auto Print = [](TokenSource S) {
+        switch (S) {
+        case TokenSource::File:
+          return "File";
+        case TokenSource::MacroExpansion:
+          return "MacroExpansion";
+        case TokenSource::AfterModuleImport:
+          return "AfterModuleImport";
+        case TokenSource::MacroNameOrArg:
+          return "MacroNameOrArg";
+        case TokenSource::MacroDirective:
+          return "MacroDirective";
+        case TokenSource::Precached:
+          return "Precached";
+        case TokenSource::SkippedPPBranch:
+          return "SkippedPPBranch";
+        }
+        llvm_unreachable("unhandled TokenSource");
+      };
+      llvm::dbgs() << "source = " << Print(S) << "\n";
+      llvm::dbgs() << "$[tokenLexed] ";
+      syntax::Token(T).dump(llvm::dbgs(), SM);
+      llvm::dbgs() << "\n";
+    });
+
+    if (S == TokenSource::Precached)
+      return; // the cached tokens are reported multiple times
+
+    auto L = T.getLocation();
+    flushCurrentExpansion(L);
+
+    if (ExpansionStart.isValid() && SM.getExpansionLoc(L) != ExpansionStart) {
+      // There are intermediate replacements while processing macro arguments.
+      // Skip them, they will be reported again.
+      return;
+    }
+
+    // 'eod' is a control token that we don't capture.
+    if (T.getKind() == tok::eod)
+      return;
+
+    DEBUG_WITH_TYPE("collect-tokens", {
+      llvm::dbgs() << "$[token] ";
+      syntax::Token(T).dump(llvm::dbgs(), SM);
+      llvm::dbgs() << "\n";
+    });
+
+    // Depending on where the token comes from, put it into an expanded token
+    // stream, a raw token stream, or both.
+    switch (S) {
+    case TokenSource::File:
+      assert(T.getLocation().isFileID());
+      Result.ExpandedTokens.push_back(syntax::Token(T));
+      File->RawTokens.push_back(syntax::Token(T));
+      break;
+    case TokenSource::MacroExpansion:
+      assert(T.getLocation().isMacroID());
+      Result.ExpandedTokens.push_back(syntax::Token(T));
+      break;
+    case TokenSource::MacroNameOrArg:
+    case TokenSource::MacroDirective:
+    case TokenSource::SkippedPPBranch:
+      assert(T.getLocation().isFileID());
+      File->RawTokens.push_back(syntax::Token(T));
+      break;
+    case TokenSource::Precached:
+      llvm_unreachable("cached tokens should be handled before");
+    case TokenSource::AfterModuleImport:
+      llvm_unreachable("not implemented yet");
+    }
+  }
+
+  void MacroExpands(const clang::Token &MacroNameTok, const MacroDefinition &MD,
+                    SourceRange Range, const MacroArgs *Args) override {
+    auto MacroNameLoc = MacroNameTok.getLocation();
+    flushCurrentExpansion(MacroNameLoc);
+
+    // We do not record recursive invocations.
+    if (isMacroExpanding())
+      return;
+
+    // Find the first raw token of the macro invocation, i.e. the name of the
+    // macro.
+    auto InvocationStart = std::find_if(
+        File->RawTokens.rbegin(), File->RawTokens.rend(),
+        [&](const syntax::Token &T) { return T.location() == MacroNameLoc; });
+    assert(InvocationStart != File->RawTokens.rend() &&
+           "the macro name of an invocation was not recorded.");
+
+    // This is a new top-level macro invocation, record it in the mappings.
+    TokenBuffer::Mapping M;
+    M.BeginRawToken =
+        std::prev(InvocationStart.base()) - File->RawTokens.begin();
+    M.EndRawToken = File->RawTokens.size();
+
+    M.BeginExpandedToken = Result.ExpandedTokens.size();
+    // M.EndExpandedToken is filled by flushMacroExpansion() when macro
+    // expansion finishes.
+
+    File->Mappings.push_back(M);
+
+    // We have to record where invocation ends in order to track it properly.
+    std::tie(MacroInvocationFile, ExpansionEndOffset) =
+        SM.getDecomposedLoc(Range.getEnd());
+    this->ExpansionStart = Range.getBegin();
+  }
+
+private:
+  bool isMacroExpanding() const { return MacroInvocationFile.isValid(); }
+
+  void flushMacroExpansion() {
+    if (!MacroInvocationFile.isValid())
+      return;
+    assert(!File->Mappings.empty());
+    assert(File->Mappings.back().EndExpandedToken == 0);
+    File->Mappings.back().EndExpandedToken = Result.ExpandedTokens.size();
+
+    MacroInvocationFile = FileID();
+    ExpansionStart = SourceLocation();
+    ExpansionEndOffset = 0;
+  }
+
+  void flushCurrentExpansion(SourceLocation L) {
+    assert(L.isValid());
+    if (!MacroInvocationFile.isValid())
+      return;
+    FileID FID;
+    unsigned Offset;
+    std::tie(FID, Offset) = SM.getDecomposedLoc(L);
+    // Check we are not inside the current macro arguments.
+    if (FID != MacroInvocationFile || Offset <= ExpansionEndOffset)
+      return;
+    flushMacroExpansion();
+  }
+
+  TokenBuffer::MarkedFile *File = nullptr;
+  /// When valid, the file of the last active top-level macro invocation.
+  FileID MacroInvocationFile;
+  SourceLocation ExpansionStart;
+  unsigned ExpansionEndOffset = 0;
+  TokenBuffer &Result;
+  const SourceManager &SM;
+};
+
+// std::pair<unsigned, unsigned>
+// MacroInvocation::invocationRange(const TokenBuffer &B, FileID F,
+//                                  const SourceManager &SM) const {
+//   auto M = invocationTokens(B, F);
+//   return {SM.getFileOffset(M.front().location()),
+//           SM.getFileOffset(M.back().endLocation())};
+// }
+
+TokenCollector::TokenCollector(Preprocessor &PP) {
+  auto CBOwner = llvm::make_unique<Callbacks>(PP.getSourceManager(), Tokens);
+  auto *CB = CBOwner.get();
+
+  PP.addPPCallbacks(std::move(CBOwner));
+  PP.setTokenWatcher(
+      [CB](const clang::Token &T, TokenSource S) { CB->tokenLexed(T, S); });
+}
+
+TokenBuffer TokenCollector::consume() && { return std::move(Tokens); }
+
+llvm::Optional<std::pair<unsigned, unsigned>>
+TokenBuffer::toOffsetRange(const Token *Begin, const Token *End,
+                           const SourceManager &SM) const {
+  assert(Begin < End);
+
+  auto FileIt = Files.find(SM.getFileID(SM.getExpansionLoc(Begin->location())));
+  assert(FileIt != Files.end() && "no file for an expanded token");
+  // Crossing the file boundaries is not supported at the moment.
+  if (Begin != End && FileIt != Files.find(SM.getFileID(SM.getExpansionLoc(
+                                    std::prev(End)->location()))))
+    return llvm::None;
+  const MarkedFile &File = FileIt->second;
+
+  unsigned BeginIndex = Begin - ExpandedTokens.data();
+  unsigned EndIndex = End - ExpandedTokens.data();
+
+  // Find the first raw token mapping that intersects with our range.
+  auto FirstCall = std::upper_bound(
+      File.Mappings.begin(), File.Mappings.end(), BeginIndex,
+      [](unsigned L, const Mapping &R) { return L < R.BeginExpandedToken; });
+  if (FirstCall != File.Mappings.begin()) {
+    --FirstCall;
+    if (FirstCall->EndExpandedToken <= BeginIndex)
+      FirstCall = File.Mappings.end();
+  } else {
+    FirstCall = File.Mappings.end();
+  }
+  // Find the last macro call that intersects with our range.
+  auto LastCall = std::lower_bound(
+      File.Mappings.begin(), File.Mappings.end(), EndIndex,
+      [](const Mapping &L, unsigned R) { return L.EndExpandedToken < R; });
+  if (LastCall != File.Mappings.end() &&
+      EndIndex <= LastCall->BeginExpandedToken)
+    LastCall = File.Mappings.end();
+  // Only allow changes that involve the whole macro calls, disallow anything
+  // that changes macros in between.
+  // FIXME: also allow changes uniquely mapping to macro arguments.
+  assert(FirstCall == File.Mappings.end() || LastCall == File.Mappings.end() ||
+         FirstCall <= LastCall);
+
+  // Check the first macro call is fully-covered.
+  if (FirstCall != File.Mappings.end() &&
+      (FirstCall->BeginExpandedToken < BeginIndex ||
+       EndIndex < FirstCall->EndExpandedToken)) {
+    return llvm::None;
+  }
+  // Check the last macro call is fully-covered.
+  if (LastCall != File.Mappings.end() &&
+      (LastCall->BeginExpandedToken < BeginIndex ||
+       EndIndex < LastCall->EndExpandedToken)) {
+    return llvm::None;
+  }
+
+  unsigned BeginOffset =
+      SM.getFileOffset(FirstCall != File.Mappings.end()
+                           ? File.RawTokens[FirstCall->BeginRawToken].location()
+                           : Begin->location());
+  unsigned EndOffset = SM.getFileOffset(
+      LastCall != File.Mappings.end()
+          ? File.RawTokens[LastCall->EndRawToken - 1].endLocation()
+          : std::prev(End)->endLocation());
+  return std::make_pair(BeginOffset, EndOffset);
+}
+
+void syntax::Token::dump(llvm::raw_ostream &OS, const SourceManager &SM) const {
+  OS << llvm::formatv("Token({0}, length = {1}, location = {2}, text = {3})",
+                      tok::getTokenName(kind()), length(),
+                      location().printToString(SM), text(SM));
+}
+
+llvm::raw_ostream& syntax::operator<<(llvm::raw_ostream &OS, const Token &T) {
+  return OS << llvm::formatv("Token({0}, length = {1})",
+                             tok::getTokenName(T.kind()), T.length());
+}
+
+void TokenBuffer::dump(llvm::raw_ostream &OS, const SourceManager &SM) const {
+  OS << "expanded tokens:\n";
+  for (unsigned I = 0; I < ExpandedTokens.size(); ++I) {
+    OS << "  " << I << ": ";
+    ExpandedTokens[I].dump(OS, SM);
+    OS << "\n";
+  }
+
+  std::vector<FileID> Keys;
+  for (auto F : Files)
+    Keys.push_back(F.first);
+  llvm::sort(Keys);
+
+  for (FileID ID : Keys) {
+    const MarkedFile& File = Files.find(ID)->second;
+
+    auto *Entry = SM.getFileEntryForID(ID);
+    OS << "  file " << (Entry ? Entry->getName() : "<<virtual file>>" ) << "\n";
+    OS << "   raw tokens:\n";
+    for (unsigned I = 0; I < File.RawTokens.size(); ++I) {
+      OS << "    " << I << ": ";
+      File.RawTokens[I].dump(OS, SM);
+      OS << "\n";
+    }
+    OS << "   mappings:\n";
+    for (auto &M : File.Mappings)
+      OS << "    " << M.str() << "\n";
+  }
+}
+
+llvm::ArrayRef<syntax::Token> TokenBuffer::rawTokens(FileID FID) const {
+  auto It = Files.find(FID);
+  assert(It != Files.end());
+  return It->second.RawTokens;
+}
+
+std::string TokenBuffer::Mapping::str() const {
+  return llvm::formatv("raw tokens: [{0},{1}), expanded "
+                       "tokens: [{2},{3})",
+                       BeginRawToken, EndRawToken, BeginExpandedToken,
+                       EndExpandedToken);
+}
Index: clang/lib/Tooling/Syntax/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+add_clang_library(clangToolingSyntax
+  Tokens.cpp
+
+  LINK_LIBS
+  clangBasic
+  clangFrontend
+  clangLex
+  )
Index: clang/lib/Tooling/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/CMakeLists.txt
+++ clang/lib/Tooling/CMakeLists.txt
@@ -7,6 +7,7 @@
 add_subdirectory(Inclusions)
 add_subdirectory(Refactoring)
 add_subdirectory(ASTDiff)
+add_subdirectory(Syntax)
 
 add_clang_library(clangTooling
   AllTUsExecution.cpp
Index: clang/include/clang/Tooling/Syntax/Tokens.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Tokens.h
@@ -0,0 +1,212 @@
+//===- Tokens.h - collect tokens when preprocessing -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
+
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+
+namespace clang {
+class Preprocessor;
+
+namespace syntax {
+class TokenBuffer;
+
+/// A token coming directly from a file or from a macro invocation. Has just
+/// enough information to locate the token in the source code.
+class Token {
+public:
+  Token() = default;
+  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind)
+      : Location(Location), Length(Length), Kind(Kind) {}
+  /// EXPECTS: clang::Token is not an annotation token.
+  explicit Token(const clang::Token &T);
+
+  tok::TokenKind kind() const { return Kind; }
+  SourceLocation location() const { return Location; }
+  SourceLocation endLocation() const {
+    return Location.getLocWithOffset(Length);
+  }
+  unsigned length() const { return Length; }
+
+  /// Get the substring covered by the token. Note that this will include all
+  /// digraphs, newline continuations, etc. E.g. 'int' and
+  ///    in\
+  ///    t
+  /// both have the kind tok::kw_int, but the results of text() are different.
+  llvm::StringRef text(const SourceManager &SM) const;
+
+  /// For debugging purposes. More verbose than stream output operator defined
+  /// below, but requires a source manager.
+  void dump(llvm::raw_ostream &OS, const SourceManager &SM) const;
+
+private:
+  SourceLocation Location;
+  unsigned Length = 0;
+  tok::TokenKind Kind = tok::NUM_TOKENS;
+};
+static_assert(sizeof(Token) <= 16, "Token is unreasonably large");
+/// For debugging purposes. Less verbose than dump().
+llvm::raw_ostream& operator<<(llvm::raw_ostream &OS, const Token &T);
+
+
+/// A list of tokens obtained by lexing and preprocessing a text buffer and a
+/// set of helpers to allow mapping the tokens after preprocessing to the
+/// corresponding code written in a file. TokenBuffer has information about two
+/// token streams:
+///    1. tokens produced by the preprocessor after all macro replacements,
+///    2. original tokens from the source code of a file before any macro
+///       replacements occurred.
+/// The tokens for (1) are stored directly and can be accessed with the
+/// expandedTokens() method. However, some of these tokens may come from macro
+/// invocations and so don't correspond directly to any text in a file, e.g.
+///
+///     #define FOO 10
+///     int a = FOO;  // no token '10' in the file, just 'FOO'
+///
+/// For these tokens, TokenBuffer allows one to obtain the macro name and macro
+/// arguments that were originally seen in the source code with the
+/// 'toOffsetRange()' method.
+///
+/// There are two ways to build a TokenBuffer:
+///   1. If you are running a clang frontend invocation, use the TokenCollector
+///      class,
+///   2. if you only need to lex a file, use the tokenize() helper.
+class TokenBuffer {
+public:
+  /// All tokens produced by the preprocessor after macro replacements. Source
+  /// locations found in the clang AST will always point to one of the tokens in
+  /// the corresponding token buffer.
+  llvm::ArrayRef<syntax::Token> expandedTokens() const {
+    return ExpandedTokens;
+  }
+  /// Attempt to map a subrange of expandedTokens() into a continuous substring
+  /// of the original source file. The mapping fails if the range crosses
+  /// boundaries of macro invocations, that is, doesn't correspond to a complete
+  /// top-level macro invocation.
+  /// Given this source file:
+  ///
+  ///   #define FIRST f1 f2 f3
+  ///   #define SECOND s1 s2 s3
+  ///
+  ///   a FIRST b SECOND c  // expansion: a f1 f2 f3 b s1 s2 s3 c
+  ///
+  /// toOffsetRange will map tokens like this:
+  ///   input range => output range
+  ///   ------
+  ///   a => a
+  ///   s1 s2 s3 => SECOND
+  ///   a f1 f2 f3 => a FIRST
+  ///   a f1 => can't map
+  ///   s1 s2 => can't map
+  /// Mapping will also fail when the start and end tokens are from different
+  /// files.
+  llvm::Optional<std::pair<unsigned, unsigned>>
+  toOffsetRange(const Token *Begin, const Token *End,
+                const SourceManager &SM) const;
+
+  /// Tokens of a file before preprocessing.
+  ///     #define DECL(name) int name = 10
+  ///     DECL(a);
+  /// For the input above, rawTokens() should return {"DECL", "(", "a", ")"}.
+  /// FIXME: we do not yet store tokens of directives, like #include, #define,
+  ///        #pragma, etc.
+  llvm::ArrayRef<syntax::Token> rawTokens(FileID FID) const;
+
+  /// For debugging purposes.
+  void dump(llvm::raw_ostream &OS, const SourceManager &SM) const;
+
+private:
+  /// Describes a mapping between a continuous subrange of raw tokens and the
+  /// expanded tokens. Represents macro expansions, preprocessor directives,
+  /// conditionally disabled pp regions, etc.
+  ///   #define FOO 1+2
+  ///   #define BAR(a) a + 1
+  ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
+  ///   BAR(1) // invocation #2, tokens = {'1', '+', '1'},
+  ///                            macroTokens = {'BAR', '(', '1', ')'}.
+  struct Mapping {
+    // Positions in the corresponding raw token stream. The corresponding range
+    // is never empty.
+    unsigned BeginRawToken = 0;
+    unsigned EndRawToken = 0;
+    // Positions in the expanded token stream. The corresponding range can be
+    // empty.
+    unsigned BeginExpandedToken = 0;
+    unsigned EndExpandedToken = 0;
+
+    /// For debugging purposes.
+    std::string str() const;
+  };
+  /// Raw tokens of the file with information about the subranges.
+  struct MarkedFile {
+    /// Lexed, but not preprocessed, tokens of the file. These map directly to
+    /// text in the corresponding files and include tokens of all preprocessor
+    /// directives.
+    /// FIXME: raw tokens don't change across FileID that map to the same
+    ///        FileEntry. We could consider deduplicating them to save memory.
+    std::vector<syntax::Token> RawTokens;
+    /// A sorted list to convert between the raw and expanded token streams.
+    std::vector<Mapping> Mappings;
+  };
+
+  friend class TokenCollector;
+  // Testing code has access to internal mapping.
+  friend class TokensTest;
+
+  /// Token stream produced after preprocessing, conceptually this captures the
+  /// same stream as 'clang -E', excluding the preprocessor directives (#file,
+  /// etc.) that it adds.
+  std::vector<syntax::Token> ExpandedTokens;
+  llvm::DenseMap<FileID, MarkedFile> Files;
+};
+
+
+/// Lex the text buffer, corresponding to \p FID, in raw mode and record the
+/// resulting tokens. Does minimal post-processing on raw identifiers, setting
+/// their corresponding token kind. This is a very low-level function, most
+/// users should prefer to use TokenCollector. Lexing in raw mode produces
+/// wildly different results from what one might expect when running a C++
+/// frontend, e.g. preprocessor does not run at all.
+std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
+                                    const LangOptions &LO);
+
+/// Collects tokens for the main file while running the frontend action. An
+/// instance of this object should be created on
+/// FrontendAction::BeginSourceFile() and the results should be consumed after
+/// FrontendAction::Execute() finishes.
+class TokenCollector {
+public:
+  /// Adds the hooks to collect the tokens. Should be called before the
+  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
+  /// CreateASTConsumer().
+  TokenCollector(Preprocessor &P);
+
+  /// Finalizes token collection. Should be called after preprocessing is
+  /// finished, i.e. after running Execute().
+  LLVM_NODISCARD TokenBuffer consume() &&;
+
+private:
+  class Callbacks;
+  TokenBuffer Tokens;
+};
+
+} // namespace syntax
+} // namespace clang
+
+#endif

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D59887: [Syntax] Introduce TokenBuffer, start clangToolingSyntax library

Reply via email to