TokenBuffer stores the list of tokens for a file obtained after
preprocessing. This is a base building block for syntax trees,
see [1] for the full proposal on syntax trees.

This commit also starts a new sub-library of ClangTooling, which
would be the home for the syntax trees and syntax-tree-based refactoring
utilities.


Index: clang/unittests/Tooling/Syntax/TokenBufferTest.cpp
--- /dev/null
+++ clang/unittests/Tooling/Syntax/TokenBufferTest.cpp
@@ -0,0 +1,471 @@
+//===- TokenBufferTest.cpp ------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "clang/Tooling/Syntax/TokenBuffer.h"
+#include "clang/AST/ASTConsumer.h"
+#include "clang/AST/Expr.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticIDs.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/FileSystemOptions.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.def"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/FrontendAction.h"
+#include "clang/Frontend/Utils.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/Token.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Testing/Support/Annotations.h"
+#include <cassert>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <memory>
+#include <ostream>
+#include <string>
+using namespace clang;
+using namespace clang::syntax;
+using ::testing::AllOf;
+using ::testing::Contains;
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+using ::testing::Matcher;
+using ::testing::Pointwise;
+// Debug printers.
+// FIXME: This should live somewhere else or be implemented as 'operator
+// <<(raw_ostream&, T)'.
+namespace clang {
+namespace tok {
+inline void PrintTo(TokenKind K, std::ostream *OS) {
+  *OS << tok::getTokenName(K);
+} // namespace tok
+namespace syntax {
+inline void PrintTo(const syntax::Token &T, std::ostream *OS) {
+  PrintTo(T.kind(), OS);
+  OS->flush();
+} // namespace syntax
+} // namespace clang
+namespace {
+// Matchers for clang::Token.
+MATCHER_P(Kind, K, "") { return arg.kind() == K; }
+MATCHER_P2(HasText, Text, SourceMgr, "") {
+  return arg.text(*SourceMgr) == Text;
+MATCHER_P2(IsIdent, Text, SourceMgr, "") {
+  return arg.kind() == tok::identifier && arg.text(*SourceMgr) == Text;
+/// Checks the start and end location of a token are equal to SourceRng.
+MATCHER_P(RangeIs, SourceRng, "") {
+  return arg.location() == SourceRng.first &&
+         arg.endLocation() == SourceRng.second;
+/// Checks the passed tuple has two similar tokens, i.e. both are of the same
+/// kind and both are spelled with the same text in the source.
+MATCHER_P(IsSameToken, SourceMgr, "") {
+  auto &L = std::get<0>(arg);
+  auto &R = std::get<1>(arg);
+  if (L.kind() != R.kind())
+    return false;
+  // Compare the left token against the right one; comparing a token's text
+  // with itself would make this check vacuously true.
+  return L.text(*SourceMgr) == R.text(*SourceMgr);
+class TokenBufferTest : public ::testing::Test {
+  /// Run the clang frontend, collect the preprocessed tokens from the frontend
+  /// invocation and store them in this->Tokens.
+  /// This also clears SourceManager before running the compiler.
+  void recordTokens(llvm::StringRef Code) {
+    class RecordTokens : public ASTFrontendAction {
+    public:
+      explicit RecordTokens(TokenBuffer &Result) : Result(Result) {}
+      bool BeginSourceFileAction(CompilerInstance &CI) override {
+        assert(!Collector && "expected only a single call to BeginSourceFile");
+        Collector.emplace(CI.getPreprocessor());
+        return true;
+      }
+      void EndSourceFileAction() override {
+        assert(Collector && "BeginSourceFileAction was never called");
+        Result = std::move(*Collector).consume();
+      }
+      std::unique_ptr<ASTConsumer>
+      CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override {
+        return llvm::make_unique<ASTConsumer>();
+      }
+    private:
+      TokenBuffer &Result;
+      llvm::Optional<TokenCollector> Collector;
+    };
+    constexpr const char *FileName = "./input.cpp";
+    FS->addFile(FileName, time_t(), llvm::MemoryBuffer::getMemBufferCopy(""));
+    // Prepare to run a compiler.
+    std::vector<const char *> Args = {"tok-test", "-std=c++03", "-fsyntax-only",
+                                      FileName};
+    auto CI = createInvocationFromCommandLine(Args, Diags, FS);
+    assert(CI);
+    CI->getFrontendOpts().DisableFree = false;
+    CI->getPreprocessorOpts().addRemappedFile(
+        FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release());
+    LangOpts = *CI->getLangOpts();
+    CompilerInstance Compiler;
+    Compiler.setInvocation(std::move(CI));
+    if (!Diags->getClient())
+      Diags->setClient(new IgnoringDiagConsumer);
+    Compiler.setDiagnostics(Diags.get());
+    Compiler.setFileManager(FileMgr.get());
+    Compiler.setSourceManager(SourceMgr.get());
+    this->Buffer = TokenBuffer();
+    RecordTokens Recorder(this->Buffer);
+    ASSERT_TRUE(Compiler.ExecuteAction(Recorder))
+        << "failed to run the frontend";
+  }
+  /// Run syntax::tokenize() on \p Text and return the results. The text is
+  /// copied into a fresh buffer registered with SourceMgr.
+  TokenBuffer tokenize(llvm::StringRef Text) {
+    // Null-terminate so that we always see 'tok::eof' at the end.
+    // std::string guarantees a terminator at data()[size()], so a view of
+    // size() + 1 characters includes it.
+    std::string NullTerminated = Text.str();
+    auto FID = SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(
+        StringRef(NullTerminated.data(), NullTerminated.size() + 1)));
+    return syntax::tokenize(FID, *SourceMgr, LangOpts);
+  }
+  /// Lexes \p ExpectedText in raw mode and expects the resulting token stream
+  /// to match, token by token, the one stored in this->Buffer.tokens().
+  void checkTokens(llvm::StringRef ExpectedText) {
+    auto RawBuffer = tokenize(ExpectedText);
+    std::vector<syntax::Token> Expected = RawBuffer.tokens();
+    std::vector<syntax::Token> Recorded = Buffer.tokens();
+    EXPECT_THAT(Recorded, Pointwise(IsSameToken(), Expected))
+        << "\texpected tokens: " << ExpectedText;
+  }
+  struct ExpectedExpansion {
+    ExpectedExpansion(std::string From, std::string To,
+                      llvm::Optional<llvm::Range> Range = llvm::None)
+        : From(std::move(From)), To(std::move(To)), Range(Range) {}
+    /// A textual representation of the macro tokens.
+    std::string From;
+    /// A textual representation of the expansion result.
+    std::string To;
+    /// A text range the expansion points to.
+    llvm::Optional<llvm::Range> Range;
+  };
+  /// Checks the expansions in this->Buffer.expansions() match the \p Expected
+  /// ones: the counts must be equal and, pairwise, the macro tokens and the
+  /// tokens after expansion must be the same. The source range of the macro
+  /// call is compared only when the expected expansion specifies one.
+  void checkExpansions(llvm::ArrayRef<ExpectedExpansion> Expected) {
+    auto Actual = Buffer.expansions();
+    ASSERT_EQ(Actual.size(), Expected.size());
+    for (unsigned I = 0; I < Actual.size(); ++I) {
+      auto &A = Actual[I];
+      auto &E = Expected[I];
+      if (E.Range)
+        ASSERT_EQ(A.macroRange(Buffer, *SourceMgr),
+                  (std::pair<unsigned, unsigned>(E.Range->Begin, E.Range->End)))
+            << "\trange does not match";
+      // The last token produced by tokenize() is dropped before comparing
+      // (presumably the trailing 'tok::eof', which expansions do not contain).
+      EXPECT_THAT(
+          std::vector<syntax::Token>(A.macroTokens(Buffer)),
+          Pointwise(IsSameToken(), std::vector<syntax::Token>(
+                                       tokenize(E.From).tokens().drop_back())))
+          << "\tmacro tokens do not match, expected " << E.From;
+      EXPECT_THAT(
+          std::vector<syntax::Token>(A.tokens(Buffer)),
+          Pointwise(IsSameToken(), std::vector<syntax::Token>(
+                                       tokenize(E.To).tokens().drop_back())))
+          << "\ttokens after expansion do not match, expected " << E.To;
+    }
+  }
+  // Specialized versions of matchers that rely on SourceManager.
+  Matcher<syntax::Token> IsIdent(std::string Text) const {
+    return ::IsIdent(Text, SourceMgr.get());
+  }
+  Matcher<syntax::Token> HasText(std::string Text) const {
+    return ::HasText(Text, SourceMgr.get());
+  }
+  Matcher<syntax::Token> RangeIs(llvm::Range R) const {
+    // Both ends of the range are offsets from the start of the main file.
+    const SourceLocation FileStart =
+        SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID());
+    return ::RangeIs(std::pair<SourceLocation, SourceLocation>(
+        FileStart.getLocWithOffset(R.Begin),
+        FileStart.getLocWithOffset(R.End)));
+  }
+  Matcher<std::tuple<const syntax::Token &, const syntax::Token &>>
+  IsSameToken() const {
+    return ::IsSameToken(SourceMgr.get());
+  }
+  // Data fields.
+  llvm::IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
+      new DiagnosticsEngine(new DiagnosticIDs, new DiagnosticOptions);
+  IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS =
+      new llvm::vfs::InMemoryFileSystem;
+  llvm::IntrusiveRefCntPtr<FileManager> FileMgr =
+      new FileManager(FileSystemOptions(), FS);
+  llvm::IntrusiveRefCntPtr<SourceManager> SourceMgr =
+      new SourceManager(*Diags, *FileMgr);
+  /// Contains last result of calling recordTokens().
+  TokenBuffer Buffer;
+  /// Contains options from last run of recordTokens().
+  LangOptions LangOpts;
+TEST_F(TokenBufferTest, RawMode) {
+  EXPECT_THAT(tokenize("int main() {}").tokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // Comments are ignored for now.
+  EXPECT_THAT(tokenize("/* foo */int a; // more comments").tokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("a"), Kind(tok::semi),
+                          Kind(tok::eof)));
+TEST_F(TokenBufferTest, Basic) {
+  recordTokens("int main() {}");
+  EXPECT_THAT(Buffer.tokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // All kinds of whitespace are ignored.
+  recordTokens("\t\n  int\t\n  main\t\n  (\t\n  )\t\n{\t\n  }\t\n");
+  EXPECT_THAT(Buffer.tokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // Recorded tokens must cover exactly the annotated source ranges.
+  llvm::Annotations Code(R"cpp(
+    $r1[[int]] $r2[[a]] $r3[[=]] $r4[["foo bar baz"]] $r5[[;]]
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_THAT(
+      Buffer.tokens(),
+      ElementsAre(AllOf(Kind(tok::kw_int), RangeIs(Code.range("r1"))),
+                  AllOf(Kind(tok::identifier), RangeIs(Code.range("r2"))),
+                  AllOf(Kind(tok::equal), RangeIs(Code.range("r3"))),
+                  AllOf(Kind(tok::string_literal), RangeIs(Code.range("r4"))),
+                  AllOf(Kind(tok::semi), RangeIs(Code.range("r5"))),
+                  Kind(tok::eof)));
+TEST_F(TokenBufferTest, MacroDirectives) {
+  // Macro directives are not stored anywhere at the moment.
+  recordTokens(R"cpp(
+    #define FOO a
+    #include "unresolved_file.h"
+    #undef FOO
+    #ifdef X
+    #else
+    #endif
+    #ifndef Y
+    #endif
+    #if 1
+    #elif 2
+    #else
+    #endif
+    #pragma once
+    #pragma something lalala
+    int a;
+  )cpp");
+  checkTokens("int a;");
+  EXPECT_THAT(Buffer.expansions(), IsEmpty());
+  EXPECT_THAT(Buffer.macroTokens(), IsEmpty());
+TEST_F(TokenBufferTest, MacroExpansions) {
+  // A simple macro definition and expansion.
+  llvm::Annotations Code(R"cpp(
+    #define INT int const
+    [[INT]] a;
+    )cpp");
+  recordTokens(Code.code());
+  checkTokens("int const a;");
+  checkExpansions({{"INT", "int const", Code.range()}});
+  // A simple functional macro invocation.
+  Code = llvm::Annotations(R"cpp(
+    #define INT(a) const int
+    [[INT(10+10)]] a;
+    )cpp");
+  recordTokens(Code.code());
+  checkTokens("const int a;");
+  checkExpansions({{"INT(10+10)", "const int", Code.range()}});
+  // Recursive macro expansions.
+  Code = llvm::Annotations(R"cpp(
+    #define ID(X) X
+    #define INT int const
+    [[ID(ID(INT))]] a;
+  )cpp");
+  recordTokens(Code.code());
+  checkTokens("int const a;");
+  checkExpansions({{"ID(ID(INT))", "int const", Code.range()}});
+  // Empty macro expansions.
+  Code = llvm::Annotations(R"cpp(
+    #define EMPTY
+    #define EMPTY_FUNC(X)
+    $m[[EMPTY]]
+    $f[[EMPTY_FUNC(1+2+3)]]
+  )cpp");
+  recordTokens(Code.code());
+  checkTokens("");
+  checkExpansions({{"EMPTY", "", Code.range("m")},
+                   {"EMPTY_FUNC(1+2+3)", "", Code.range("f")}});
+TEST_F(TokenBufferTest, SpecialTokens) {
+  // Tokens coming from concatenations.
+  recordTokens(R"cpp(
+    #define CONCAT(a, b) a ## b
+    int a = CONCAT(1, 2);
+  )cpp");
+  checkTokens("int a = 12;");
+  // Multi-line tokens with slashes at the end.
+  recordTokens("i\\\nn\\\nt");
+  EXPECT_THAT(Buffer.tokens(),
+              ElementsAre(AllOf(Kind(tok::kw_int), HasText("i\\\nn\\\nt")),
+                          Kind(tok::eof)));
+  // FIXME: test tokens with digraphs and UCN identifiers.
+TEST_F(TokenBufferTest, LateBoundTokens) {
+  // The parser eventually breaks the first '>>' into two tokens ('>' and '>'),
+  // but we chooses to record them as a single token (for now).
+  llvm::Annotations Code(R"cpp(
+    template <class T>
+    struct foo { int a; };
+    int bar = foo<foo<int$br[[>>]]().a;
+    int baz = 10 $op[[>>]] 2;
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_THAT(std::vector<syntax::Token>(Buffer.tokens()),
+              AllOf(Contains(AllOf(Kind(tok::greatergreater),
+                                   RangeIs(Code.range("br")))),
+                    Contains(AllOf(Kind(tok::greatergreater),
+                                   RangeIs(Code.range("op"))))));
+TEST_F(TokenBufferTest, DelayedParsing) {
+  llvm::StringLiteral Code = R"cpp(
+    struct Foo {
+      int method() {
+        // Parser will visit method bodies and initializers multiple time, but
+        // TokenBuffer should only record the first walk over the tokens;
+        return 100;
+      }
+      int a = 10;
+      int b = 20;
+      struct Subclass {
+        void foo() {
+          Foo().method();
+        }
+      };
+    };
+  )cpp";
+  recordTokens(Code);
+  // Checks that lexing in raw mode produces the same results, hence we're not
+  // recording any tokens twice and the order is the same.
+  checkTokens(Code);
+TEST_F(TokenBufferTest, Offsets) {
+  llvm::Annotations Code("");
+  auto OfKind = [this](tok::TokenKind K) {
+    auto It = llvm::find_if(
+        Buffer.tokens(), [K](const syntax::Token &T) { return T.kind() == K; });
+    assert(It != Buffer.tokens().end());
+    return It;
+  };
+  auto Range = [&Code](llvm::StringRef Name) {
+    auto R = Code.range(Name);
+    return std::pair<unsigned, unsigned>(R.Begin, R.End);
+  };
+  Code = llvm::Annotations(R"cpp(
+    $all[[int $a[[a]] = $numbers[[100 + 200]];]]
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::kw_int),
+                                 std::next(OfKind(tok::semi)), *SourceMgr),
+            Range("all"));
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::identifier),
+                                 std::next(OfKind(tok::identifier)),
+                                 *SourceMgr),
+            Range("a"));
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::numeric_constant),
+                                 OfKind(tok::semi), *SourceMgr),
+            Range("numbers"));
+  Code = llvm::Annotations(R"cpp(
+    #define ID(a) a
+    #define NUMBERS 100 + 200
+    $all[[ID(int) $a[[ID(a)]] = $numbers[[NUMBERS]];]]
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::kw_int),
+                                 std::next(OfKind(tok::semi)), *SourceMgr),
+            Range("all"));
+  EXPECT_EQ(*Buffer.toOffsetRange(OfKind(tok::identifier),
+                                 std::next(OfKind(tok::identifier)),
+                                 *SourceMgr),
+            Range("a"));
+  EXPECT_EQ(*Buffer.toOffsetRange(OfKind(tok::numeric_constant),
+                                 OfKind(tok::semi), *SourceMgr),
+            Range("numbers"));
+  // Ranges not fully covering macro expansions should fail.
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::numeric_constant),
+                                 std::next(OfKind(tok::numeric_constant)),
+                                 *SourceMgr),
+            llvm::None);
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::plus),
+                                 std::next(OfKind(tok::plus)), *SourceMgr),
+            llvm::None);
+} // namespace
Index: clang/unittests/Tooling/Syntax/CMakeLists.txt
--- /dev/null
+++ clang/unittests/Tooling/Syntax/CMakeLists.txt
@@ -0,0 +1,20 @@
+  Support
+  )
+  TokenBufferTest.cpp
+  clangAST
+  clangBasic
+  clangFrontend
+  clangLex
+  clangSerialization
+  clangTooling
+  clangToolingSyntax
+  LLVMTestingSupport
+  )
Index: clang/unittests/Tooling/CMakeLists.txt
--- clang/unittests/Tooling/CMakeLists.txt
+++ clang/unittests/Tooling/CMakeLists.txt
@@ -67,3 +67,6 @@
Index: clang/lib/Tooling/Syntax/TokenBuffer.cpp
--- /dev/null
+++ clang/lib/Tooling/Syntax/TokenBuffer.cpp
@@ -0,0 +1,387 @@
+//===- TokenBuffer.cpp - store tokens of preprocessed files ---*- C++ -*-=====//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "clang/Tooling/Syntax/TokenBuffer.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.def"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <iterator>
+using namespace clang;
+using namespace clang::syntax;
+syntax::Token::Token(const clang::Token &T)
+    : Token(T.getLocation(), T.getLength(), T.getKind()) {
+  assert(!T.isAnnotation());
+llvm::StringRef syntax::Token::text(const SourceManager &SM) const {
+  bool Invalid = false;
+  const char *Start = SM.getCharacterData(location(), &Invalid);
+  assert(!Invalid);
+  return llvm::StringRef(Start, length());
+TokenBuffer syntax::tokenize(FileID FID, const SourceManager &SM,
+                             const LangOptions &LO) {
+  std::vector<syntax::Token> Tokens;
+  IdentifierTable Identifiers(LO);
+  auto AddToken = [&](clang::Token T) {
+    if (T.getKind() == tok::raw_identifier && !T.needsCleaning() &&
+        !T.hasUCN()) { // FIXME: support needsCleaning and hasUCN cases.
+      clang::IdentifierInfo &II = Identifiers.get(T.getRawIdentifier());
+      T.setIdentifierInfo(&II);
+      T.setKind(II.getTokenID());
+    }
+    Tokens.push_back(syntax::Token(T));
+  };
+  Lexer L(FID, SM.getBuffer(FID), SM, LO);
+  clang::Token T;
+  while (!L.LexFromRawLexer(T))
+    AddToken(T);
+  AddToken(T);
+  return TokenBuffer(std::move(Tokens));
+class TokenCollector::Callbacks : public PPCallbacks {
+  Callbacks(const SourceManager &SM, const LangOptions &LO, TokenBuffer &Result)
+      : Result(Result), SM(SM), LO(LO) {}
+  void FileChanged(SourceLocation Loc, FileChangeReason Reason,
+                   SrcMgr::CharacteristicKind FileType,
+                   FileID PrevFID) override {
+    assert(Loc.isFileID());
+    InsideMainFile = SM.getFileID(Loc) == SM.getMainFileID();
+    flushCurrentExpansion();
+  }
+  void MacroDefined(const clang::Token &MacroNameTok,
+                    const MacroDirective *MD) override {
+    flushCurrentExpansion();
+    handleMacroDirective(MacroNameTok.getLocation(), /*AnchorDiff=*/2);
+  }
+  void MacroUndefined(const clang::Token &MacroNameTok,
+                      const MacroDefinition &MD,
+                      const MacroDirective *Undef) override {
+    flushCurrentExpansion();
+    handleMacroDirective(MacroNameTok.getLocation(), /*AnchorDiff=*/2);
+  }
+  void InclusionDirective(SourceLocation HashLoc,
+                          const clang::Token &IncludeTok, StringRef FileName,
+                          bool IsAngled, CharSourceRange FilenameRange,
+                          const FileEntry *File, StringRef SearchPath,
+                          StringRef RelativePath, const Module *Imported,
+                          SrcMgr::CharacteristicKind FileType) override {
+    flushCurrentExpansion();
+    handleMacroDirective(IncludeTok.getLocation(), /*AnchorDiff=*/1);
+  }
+  void If(SourceLocation Loc, SourceRange ConditionRange,
+          ConditionValueKind ConditionValue) override {
+    flushCurrentExpansion();
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+  void Elif(SourceLocation Loc, SourceRange ConditionRange,
+            ConditionValueKind ConditionValue, SourceLocation IfLoc) override {
+    flushCurrentExpansion();
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+  void Ifdef(SourceLocation Loc, const clang::Token &MacroNameTok,
+             const MacroDefinition &MD) override {
+    flushCurrentExpansion();
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+  void Ifndef(SourceLocation Loc, const clang::Token &MacroNameTok,
+              const MacroDefinition &MD) override {
+    flushCurrentExpansion();
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+  // Handles '#else': possibly finishes the current macro expansion and then
+  // hands the directive to handleMacroDirective().
+  // NOTE(review): this is the only directive callback here that calls the
+  // location-checked flushCurrentExpansion(Loc); the others call the
+  // unconditional overload first. Confirm the difference is intended.
+  void Else(SourceLocation Loc, SourceLocation IfLoc) override {
+    flushCurrentExpansion(Loc);
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+  void Endif(SourceLocation Loc, SourceLocation IfLoc) override {
+    flushCurrentExpansion();
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+  // FIXME: missing moduleImport(), Ident(), ...
+  void PragmaDirective(SourceLocation Loc,
+                       PragmaIntroducerKind Introducer) override {
+    if (!InsideMainFile)
+      return;
+    assert(PragmaStart.isInvalid() && "Recursive #pragma directives?");
+    PragmaStart = Loc;
+  }
+  /// Records \p T into Result.Tokens unless it is outside the main file, is
+  /// not located after the last recorded token, or belongs to an intermediate
+  /// macro argument expansion.
+  void tokenLexed(const clang::Token &T) {
+    if (!InsideMainFile)
+      return;
+    const SourceLocation TokLoc = T.getLocation();
+    assert(TokLoc.isValid());
+    // The parser may walk over the same tokens more than once; only the first
+    // pass is of interest, so skip anything that is not located after the
+    // last recorded token.
+    const bool AlreadySeen =
+        !Result.Tokens.empty() &&
+        !SM.isBeforeInTranslationUnit(Result.Tokens.back().location(), TokLoc);
+    if (AlreadySeen)
+      return;
+    flushCurrentExpansion(TokLoc);
+    // Tokens of intermediate macro argument expansions are skipped here, they
+    // will be reported again later.
+    if (ExpansionStart.isValid() &&
+        SM.getExpansionLoc(TokLoc) != ExpansionStart)
+      return;
+    DEBUG_WITH_TYPE("collect-tokens",
+                    llvm::dbgs() << llvm::formatv(
+                        "$[token], name - {0}, length - {1}, spelling - {2}\n",
+                        tok::getTokenName(T.getKind()), T.getLength(),
+                        Lexer::getSpelling(T, SM, LO)));
+    Result.Tokens.push_back(syntax::Token(T));
+    assert(Result.Tokens.back().location().isValid());
+    // An 'eod' token seen while a #pragma is being processed marks the end of
+    // that directive: hand it to handleMacroDirective() and reset the state.
+    if (!PragmaStart.isValid() || T.getKind() != tok::eod)
+      return;
+    handleMacroDirective(PragmaStart, /*AnchorDiff=*/0);
+    PragmaStart = SourceLocation();
+  }
+  /// Called when a macro is about to be expanded. Removes the tokens of the
+  /// macro call that were already recorded in Result.Tokens and, for a
+  /// top-level expansion, stores them (preceded by the macro name) in
+  /// Result.MacroTokens together with a new MacroExpansion entry.
+  void MacroExpands(const clang::Token &MacroNameTok, const MacroDefinition &MD,
+                    SourceRange Range, const MacroArgs *Args) override {
+    if (!InsideMainFile)
+      return;
+    const SourceLocation NameLoc = MacroNameTok.getLocation();
+    flushCurrentExpansion(NameLoc);
+    // MacroNameTok itself was not reported yet. Find the position right after
+    // the last recorded token that precedes the macro name; everything from
+    // there to the end belongs to the macro call.
+    auto IsBeforeName = [&](const syntax::Token &T) {
+      return SM.isBeforeInTranslationUnit(T.location(), NameLoc);
+    };
+    auto FirstCallToken =
+        std::find_if(Result.Tokens.rbegin(), Result.Tokens.rend(), IsBeforeName)
+            .base();
+    if (ExpansionFile.isValid()) {
+      // Another expansion is still active, so this one is nested in it and
+      // does not get its own record; only drop the call tokens.
+      DEBUG_WITH_TYPE("collect-tokens",
+                      llvm::dbgs() << llvm::formatv(
+                          "$[macro-expands] dropping {0} macro tokens\n",
+                          std::distance(FirstCallToken, Result.Tokens.end())));
+      Result.Tokens.erase(FirstCallToken, Result.Tokens.end());
+      return;
+    }
+    // A new top-level macro expansion: record it.
+    MacroExpansion MC;
+    MC.BeginFileToken = Result.MacroTokens.size();
+    MC.EndFileToken =
+        MC.BeginFileToken + (Result.Tokens.end() - FirstCallToken) + 1;
+    // Keep the macro name followed by the remaining tokens of the macro call.
+    Result.MacroTokens.push_back(syntax::Token(MacroNameTok));
+    for (auto &T : llvm::make_range(FirstCallToken, Result.Tokens.end()))
+      Result.MacroTokens.push_back(T);
+    // Macro call tokens are not part of the expanded tokens, so remove them.
+    DEBUG_WITH_TYPE("collect-tokens",
+                    llvm::dbgs() << llvm::formatv(
+                        "$[macro-expands] dropping {0} macro tokens\n",
+                        std::distance(FirstCallToken, Result.Tokens.end())));
+    Result.Tokens.erase(FirstCallToken, Result.Tokens.end());
+    MC.BeginExpansionToken = Result.Tokens.size();
+    // MC.EndExpansionToken is filled in once the expansion finishes.
+    Result.Expansions.push_back(MC);
+    // Remember where the macro call ends and starts so that later callbacks
+    // can tell when the expansion is over.
+    std::tie(ExpansionFile, ExpansionEndOffset) =
+        SM.getDecomposedLoc(Range.getEnd());
+    ExpansionStart = Range.getBegin();
+  }
+  /// Removes the tokens of a preprocessor directive from the end of
+  /// Result.Tokens.
+  /// \param Anchor location of a recorded token that belongs to the directive.
+  /// \param AnchorDiff number of tokens to step back from the position right
+  ///        after the anchor token to reach the first token to be removed.
+  void handleMacroDirective(SourceLocation Anchor, int AnchorDiff) {
+    if (!InsideMainFile)
+      return;
+    flushCurrentExpansion(Anchor);
+    assert(!Result.Tokens.empty());
+    assert(Result.Tokens.back().kind() == tok::eod);
+    auto MacroStart = std::find_if(Result.Tokens.rbegin(), Result.Tokens.rend(),
+                                   [&](const syntax::Token &T) {
+                                     return T.location() == Anchor;
+                                   })
+                          .base();
+    // MacroStart now points a few tokens after the start of the macro, e.g.
+    //   # define MACRO ^...
+    //   # include ^...
+    // we want to move it to point to 'define' or 'include', respectively.
+    //   # ^define MACRO ...
+    //   # ^include ...
+    assert(std::distance(Result.Tokens.begin(), MacroStart) >= AnchorDiff);
+    std::advance(MacroStart, -AnchorDiff);
+    DEBUG_WITH_TYPE("collect-tokens",
+                    llvm::dbgs() << llvm::formatv(
+                        "$[pp-directive] dropping {0} macro directive tokens\n",
+                        std::distance(MacroStart, Result.Tokens.end())));
+    Result.Tokens.erase(MacroStart, Result.Tokens.end());
+  }
+  void flushCurrentExpansion() {
+    if (!ExpansionFile.isValid())
+      return;
+    assert(!Result.Expansions.empty());
+    assert(Result.Expansions.back().EndExpansionToken == 0);
+    Result.Expansions.back().EndExpansionToken = Result.Tokens.size();
+    ExpansionFile = FileID();
+    ExpansionStart = SourceLocation();
+    ExpansionEndOffset = 0;
+  }
+  /// Finishes the current top-level expansion, but only when \p L lies in the
+  /// expansion's file past the recorded end offset of the macro call.
+  void flushCurrentExpansion(SourceLocation L) {
+    assert(L.isValid());
+    if (!ExpansionFile.isValid())
+      return;
+    const auto Decomposed = SM.getDecomposedLoc(L);
+    // Check we are not inside the current macro arguments: locations in
+    // another file, or at or before the end offset, do not end the expansion.
+    const bool PastExpansionEnd = Decomposed.first == ExpansionFile &&
+                                  Decomposed.second > ExpansionEndOffset;
+    if (PastExpansionEnd)
+      flushCurrentExpansion();
+  }
+  bool InsideMainFile = false;
+  // The start location of the currently processed #pragma directive.
+  SourceLocation PragmaStart;
+  /// When valid, the range of the last active top-level macro expansion.
+  FileID ExpansionFile;
+  SourceLocation ExpansionStart;
+  unsigned ExpansionEndOffset = 0;
+  TokenBuffer &Result;
+  const SourceManager &SM;
+  const LangOptions &LO;
+MacroExpansion::tokens(const TokenBuffer &B) const {
+  return B.tokens().slice(BeginExpansionToken,
+                          EndExpansionToken - BeginExpansionToken);
+MacroExpansion::macroTokens(const TokenBuffer &B) const {
+  return B.macroTokens().slice(BeginFileToken, EndFileToken - BeginFileToken);
+std::pair<unsigned, unsigned>
+MacroExpansion::macroRange(const TokenBuffer &B,
+                           const SourceManager &SM) const {
+  auto M = macroTokens(B);
+  return {SM.getFileOffset(M.front().location()),
+          SM.getFileOffset(M.back().endLocation())};
+TokenBuffer::TokenBuffer(std::vector<syntax::Token> Tokens)
+    : Tokens(std::move(Tokens)) {
+#ifndef NDEBUG
+  for (const auto &T : this->Tokens)
+    assert(T.location().isFileID());
+TokenCollector::TokenCollector(Preprocessor &PP) {
+  auto CBOwner = llvm::make_unique<Callbacks>(PP.getSourceManager(),
+                                              PP.getLangOpts(), Tokens);
+  auto *CB = CBOwner.get();
+  PP.addPPCallbacks(std::move(CBOwner));
+  PP.setTokenWatcher([CB](const clang::Token &T) { CB->tokenLexed(T); });
+TokenBuffer TokenCollector::consume() && { return std::move(Tokens); }
+/// Maps the non-empty token range [Begin, End) to a pair of file offsets
+/// (begin, end). \p Begin and \p End must point into this buffer's tokens.
+/// Returns llvm::None when the range covers the first or the last
+/// intersecting macro expansion only partially.
+llvm::Optional<std::pair<unsigned, unsigned>>
+TokenBuffer::toOffsetRange(const Token *Begin, const Token *End,
+                           const SourceManager &SM) const {
+  assert(Begin < End);
+  // Translate the pointers into indices into the token list, which is what
+  // the expansion records are expressed in.
+  unsigned BeginIndex = Begin - Tokens.data();
+  unsigned EndIndex = End - Tokens.data();
+  // Find the first macro call that intersects with our range.
+  auto FirstCall =
+      std::upper_bound(Expansions.begin(), Expansions.end(), BeginIndex,
+                       [](unsigned L, const MacroExpansion &R) {
+                         return L < R.BeginExpansionToken;
+                       });
+  if (FirstCall != Expansions.begin()) {
+    --FirstCall;
+    if (FirstCall->EndExpansionToken <= BeginIndex)
+      FirstCall = Expansions.end();
+  } else {
+    FirstCall = Expansions.end();
+  }
+  // Find the last macro call that intersects with our range.
+  auto LastCall =
+      std::lower_bound(Expansions.begin(), Expansions.end(), EndIndex,
+                       [](const MacroExpansion &L, unsigned R) {
+                         return L.EndExpansionToken < R;
+                       });
+  if (LastCall != Expansions.end() && EndIndex <= LastCall->BeginExpansionToken)
+    LastCall = Expansions.end();
+  // Only allow changes that involve the whole macro calls, disallow anything
+  // that changes macros in between.
+  // FIXME: also allow changes uniquely mapping to macro arguments.
+  assert(FirstCall == Expansions.end() || LastCall == Expansions.end() ||
+         FirstCall <= LastCall);
+  // Check the first macro call is fully-covered.
+  if (FirstCall != Expansions.end() &&
+      (FirstCall->BeginExpansionToken < BeginIndex ||
+       EndIndex < FirstCall->EndExpansionToken)) {
+    return llvm::None;
+  }
+  // Check the last macro call is fully-covered.
+  if (LastCall != Expansions.end() &&
+      (LastCall->BeginExpansionToken < BeginIndex ||
+       EndIndex < LastCall->EndExpansionToken)) {
+    return llvm::None;
+  }
+  // When the range starts (ends) with a macro expansion, report the offset of
+  // the macro call in the file rather than that of the expanded token.
+  unsigned BeginOffset =
+      SM.getFileOffset(FirstCall != Expansions.end()
+                           ? FirstCall->macroTokens(*this).front().location()
+                           : Begin->location());
+  unsigned EndOffset =
+      SM.getFileOffset(LastCall != Expansions.end()
+                           ? LastCall->macroTokens(*this).back().endLocation()
+                           : std::prev(End)->endLocation());
+  return std::make_pair(BeginOffset, EndOffset);
Index: clang/lib/Tooling/Syntax/CMakeLists.txt
--- /dev/null
+++ clang/lib/Tooling/Syntax/CMakeLists.txt
@@ -0,0 +1,10 @@
+  TokenBuffer.cpp
+  clangBasic
+  clangFrontend
+  clangLex
+  )
Index: clang/lib/Tooling/CMakeLists.txt
--- clang/lib/Tooling/CMakeLists.txt
+++ clang/lib/Tooling/CMakeLists.txt
@@ -7,6 +7,7 @@
Index: clang/include/clang/Tooling/Syntax/TokenBuffer.h
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/TokenBuffer.h
@@ -0,0 +1,185 @@
+//===- TokenBuffer.h - store tokens of preprocessed files -----*- C++ -*-=====//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.def"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+namespace clang {
+class Preprocessor;
+namespace syntax {
+class TokenBuffer;
+/// A token coming directly from a file or a macro expansion. Has just enough
+/// information to locate the token in the source code.
+class Token {
+  Token() = default;
+  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind)
+      : Location(Location), Length(Length), Kind(Kind) {}
+  /// EXPECTS: clang::Token is not an annotation token.
+  explicit Token(const clang::Token &T);
+  tok::TokenKind kind() const { return Kind; }
+  SourceLocation location() const { return Location; }
+  SourceLocation endLocation() const {
+    return Location.getLocWithOffset(Length);
+  }
+  unsigned length() const { return Length; }
+  /// Get the substring covered by the token. Note that this will include all
+  /// digraphs, newline continuations, etc. E.g. 'int' and
+  ///    in\
+  ///    t
+  /// both have the same kind tok::kw_int, but the results of text() differ.
+  llvm::StringRef text(const SourceManager &SM) const;
+  SourceLocation Location;
+  unsigned Length = 0;
+  tok::TokenKind Kind = tok::NUM_TOKENS;
+static_assert(sizeof(Token) <= 16, "Token is unreasonably large");
+/// A top-level macro expansion inside a file.
+class MacroExpansion {
+  /// The tokens obtained after expansion.
+  llvm::ArrayRef<syntax::Token> tokens(const TokenBuffer &B) const;
+  /// These cover the name and arguments of a macro (if any), including the
+  /// parentheses around macro arguments.
+  llvm::ArrayRef<syntax::Token> macroTokens(const TokenBuffer &B) const;
+  /// Range of offsets covering the name of a macro expansion or the name and
+  /// arguments of a function-like macro invocation.
+  std::pair<unsigned, unsigned> macroRange(const TokenBuffer &B,
+                                           const SourceManager &SM) const;
+  friend class TokenCollector;
+  friend class TokenBuffer;
+  unsigned BeginExpansionToken = 0;
+  unsigned EndExpansionToken = 0;
+  unsigned BeginFileToken = 0;
+  unsigned EndFileToken = 0;
+/// A list of tokens obtained by lexing and preprocessing a text buffer and a
+/// set of helpers to allow mapping the tokens after preprocessing to the
+/// corresponding code written in a file. TokenBuffer has information about two
+/// token streams:
+///    1. tokens produced by the preprocessor, i.e. after all macro expansions,
+///    2. pre-expansion tokens that correspond to the source code of a file.
+/// The tokens for (1) are stored directly and can be accessed with the tokens()
+/// method. However, some of these tokens may come from macro expansions and so
+/// they don't correspond directly to any text in a file, e.g.
+///     #define FOO 10
+///     int a = FOO;  // no token '10' in the file, just 'FOO'
+/// For these tokens, TokenBuffer allows obtaining the macro name and macro
+/// arguments that were used to produce the expansion with the 'toOffsetRange()'
+/// method.
+/// There are two ways to build a TokenBuffer:
+///   1. If you are running a clang frontend invocation, use the TokenCollector
+///      class,
+///   2. if you only need to lex a file, use the tokenize() helper.
+class TokenBuffer {
+  TokenBuffer() = default;
+  // Assumes no macro expansions have taken place.
+  TokenBuffer(std::vector<syntax::Token> Tokens);
+  /// All tokens from the result of preprocessor expansion, i.e. the list of
+  /// tokens produced by the preprocessor. Source locations in the clang AST
+  /// should always point to one of these tokens.
+  llvm::ArrayRef<syntax::Token> tokens() const { return Tokens; }
+  /// Attempt to map a range of expanded tokens into a continuous substring of
+  /// the original source file. The transformation may not be possible if the
+  /// range requires changing the macro expansions.
+  llvm::Optional<std::pair<unsigned, unsigned>>
+  toOffsetRange(const Token *Begin, const Token *End,
+                const SourceManager &SM) const;
+  /// All top-level macro expansions from the corresponding file. Includes
+  /// function-like macro invocations and expansions of macro identifiers.
+  /// E.g. it would contain 3 entries for the following code:
+  ///     #define FOO 2*5
+  ///     #define BAR(a,b) a+b+FOO
+  ///     BAR(FOO, FOO) // #1
+  ///     int a = FOO; // #2
+  ///     int b = BAR(a, BAR(6, FOO)); // #3
+  /// Note that neither expansions inside macro arguments (e.g. 'FOO' in
+  /// 'BAR(FOO, FOO)') nor recursive macro expansions are present in the
+  /// result.
+  llvm::ArrayRef<MacroExpansion> expansions() const { return Expansions; }
+  /// Tokens of macro directives and top-level macro expansions. These are not
+  /// part of the expanded token stream, but they fill the gaps for the file.
+  /// Here is an example:
+  ///     #define DECL(name) int name = 10
+  ///     DECL(a);
+  /// For the input above, we would get tokens() = {"int", "a", "=", "10", ";"}
+  /// and macroTokens() = {"DECL", "(", "a", ")"}.
+  llvm::ArrayRef<syntax::Token> macroTokens() const { return MacroTokens; }
+  friend class TokenCollector;
+  friend class MacroExpansion;
+  /// Expanded tokens, the ASTs are built on top of these. Some of the tokens
+  /// have file locations and can be used to obtain the file offsets directly.
+  std::vector<syntax::Token> Tokens;
+  /// Tokens forming top-level macro expansions, i.e. all macro names and macro
+  /// arguments.
+  std::vector<syntax::Token> MacroTokens;
+  /// A list of top-level macro expansions inside a particular file.
+  std::vector<MacroExpansion> Expansions;
+/// Lex the text buffer corresponding to \p FID in raw mode and record the
+/// resulting tokens. Does minimal post-processing on raw identifiers, setting
+/// their corresponding token kind. This is a very low-level function; most
+/// users should prefer to use TokenCollector. Lexing in raw mode produces
+/// wildly different results from what one might expect when running a C++
+/// frontend, e.g. the preprocessor does not run at all.
+TokenBuffer tokenize(FileID FID, const SourceManager &SM,
+                     const LangOptions &LO);
+/// Collects tokens for the main file while running the frontend action. An
+/// instance of this object should be created in
+/// FrontendAction::BeginSourceFile() and the results should be consumed after
+/// FrontendAction::Execute() finishes.
+class TokenCollector {
+  /// Adds the hooks to collect the tokens. Should be called before the
+  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
+  /// CreateASTConsumer().
+  TokenCollector(Preprocessor &P);
+  /// Consumes the result. Call only after preprocessing has finished, i.e.
+  /// after Execute(); needs an rvalue: std::move(Collector).consume().
+  TokenBuffer consume() &&;
+  class Callbacks;
+  TokenBuffer Tokens;
+} // namespace syntax
+} // namespace clang
