akyrtzi created this revision.
Herald added a project: All.
akyrtzi requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Depends on D125486 <https://reviews.llvm.org/D125486>

This is patch 3/4 in a series that makes the special lexing for dependency 
scanning a first-class feature of the `Preprocessor` and `Lexer`.
This patch replaces the "source minimization" mechanism with one that 
produces lexed dependency directive tokens.

The preprocessor does not yet use these directive tokens during dependency 
scanning; that will come in the next patch of the series.

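For reference, below is a minimal sketch (not part of this patch) of how a
client drives the new scanner entry points; it mirrors the test helper in the
updated `DependencyDirectivesScannerTest.cpp`, and the `scanAndPrint` wrapper
and its `Input` parameter are illustrative names only:

```cpp
#include "clang/Lex/DependencyDirectivesScanner.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"

using namespace clang;

// Assumed example: scan a source buffer for dependency directives and print
// them back as minimized source. Returns true on a scanning error.
static bool scanAndPrint(StringRef Input) {
  SmallVector<dependency_directives_scan::Token, 16> Tokens;
  SmallVector<dependency_directives_scan::Directive, 32> Directives;

  // On success, Tokens holds the raw lexed tokens of the directives and each
  // Directive references its slice of Tokens, so Tokens must outlive
  // Directives.
  if (scanSourceForDependencyDirectives(Input, Tokens, Directives))
    return true; // Diagnostics are only reported if a DiagnosticsEngine is
                 // passed as the optional fourth argument.

  // Primarily for testing: render the collected directive tokens back as
  // minimized source text.
  printDependencyDirectivesAsSource(Input, Directives, llvm::outs());
  return false;
}
```

During actual dependency scanning the preprocessor will consume the collected
tokens directly (patch 4/4); the printed form is only used by the tests and
the `-print-dependency-directives-minimized-source` action.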

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D125487

Files:
  clang/include/clang/Lex/DependencyDirectivesScanner.h
  clang/include/clang/Lex/Lexer.h
  clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
  clang/lib/Frontend/FrontendActions.cpp
  clang/lib/Lex/DependencyDirectivesScanner.cpp
  clang/lib/Lex/Lexer.cpp
  clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
  clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c
  clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c
  clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
  clang/unittests/Tooling/DependencyScannerTest.cpp

Index: clang/unittests/Tooling/DependencyScannerTest.cpp
===================================================================
--- clang/unittests/Tooling/DependencyScannerTest.cpp
+++ clang/unittests/Tooling/DependencyScannerTest.cpp
@@ -204,51 +204,5 @@
   EXPECT_EQ(convert_to_slash(Deps[5]), "/root/symlink.h");
 }
 
-namespace dependencies {
-TEST(DependencyScanningFilesystem, IgnoredFilesAreCachedSeparately1) {
-  auto VFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
-  VFS->addFile("/mod.h", 0,
-               llvm::MemoryBuffer::getMemBuffer("#include <foo.h>\n"
-                                                "// hi there!\n"));
-
-  DependencyScanningFilesystemSharedCache SharedCache;
-  DependencyScanningWorkerFilesystem DepFS(SharedCache, VFS);
-
-  DepFS.enableDirectivesScanningOfAllFiles(); // Let's be explicit for clarity.
-  auto StatusMinimized0 = DepFS.status("/mod.h");
-  DepFS.disableDirectivesScanning("/mod.h");
-  auto StatusFull1 = DepFS.status("/mod.h");
-
-  EXPECT_TRUE(StatusMinimized0);
-  EXPECT_TRUE(StatusFull1);
-  EXPECT_EQ(StatusMinimized0->getSize(), 17u);
-  EXPECT_EQ(StatusFull1->getSize(), 30u);
-  EXPECT_EQ(StatusMinimized0->getName(), StringRef("/mod.h"));
-  EXPECT_EQ(StatusFull1->getName(), StringRef("/mod.h"));
-}
-
-TEST(DependencyScanningFilesystem, IgnoredFilesAreCachedSeparately2) {
-  auto VFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
-  VFS->addFile("/mod.h", 0,
-               llvm::MemoryBuffer::getMemBuffer("#include <foo.h>\n"
-                                                "// hi there!\n"));
-
-  DependencyScanningFilesystemSharedCache SharedCache;
-  DependencyScanningWorkerFilesystem DepFS(SharedCache, VFS);
-
-  DepFS.disableDirectivesScanning("/mod.h");
-  auto StatusFull0 = DepFS.status("/mod.h");
-  DepFS.enableDirectivesScanningOfAllFiles();
-  auto StatusMinimized1 = DepFS.status("/mod.h");
-
-  EXPECT_TRUE(StatusFull0);
-  EXPECT_TRUE(StatusMinimized1);
-  EXPECT_EQ(StatusFull0->getSize(), 30u);
-  EXPECT_EQ(StatusMinimized1->getSize(), 17u);
-  EXPECT_EQ(StatusFull0->getName(), StringRef("/mod.h"));
-  EXPECT_EQ(StatusMinimized1->getName(), StringRef("/mod.h"));
-}
-
-} // end namespace dependencies
 } // end namespace tooling
 } // end namespace clang
Index: clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
===================================================================
--- clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
+++ clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
@@ -14,39 +14,58 @@
 using namespace clang;
 using namespace clang::dependency_directives_scan;
 
-static bool minimizeSourceToDependencyDirectives(StringRef Input,
-                                                 SmallVectorImpl<char> &Out) {
-  SmallVector<dependency_directives_scan::Directive, 32> Directives;
-  return scanSourceForDependencyDirectives(Input, Out, Directives);
+static bool minimizeSourceToDependencyDirectives(
+    StringRef Input, SmallVectorImpl<char> &Out,
+    SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
+    SmallVectorImpl<Directive> &Directives) {
+  Out.clear();
+  Tokens.clear();
+  Directives.clear();
+  if (scanSourceForDependencyDirectives(Input, Tokens, Directives))
+    return true;
+
+  raw_svector_ostream OS(Out);
+  printDependencyDirectivesAsSource(Input, Directives, OS);
+  if (!Out.empty() && Out.back() != '\n')
+    Out.push_back('\n');
+  Out.push_back('\0');
+  Out.pop_back();
+
+  return false;
 }
 
-static bool
-minimizeSourceToDependencyDirectives(StringRef Input,
-                                     SmallVectorImpl<char> &Out,
-                                     SmallVectorImpl<Directive> &Directives) {
-  return scanSourceForDependencyDirectives(Input, Out, Directives);
+static bool minimizeSourceToDependencyDirectives(StringRef Input,
+                                                 SmallVectorImpl<char> &Out) {
+  SmallVector<dependency_directives_scan::Token, 16> Tokens;
+  SmallVector<Directive, 32> Directives;
+  return minimizeSourceToDependencyDirectives(Input, Out, Tokens, Directives);
 }
 
 namespace {
 
 TEST(MinimizeSourceToDependencyDirectivesTest, Empty) {
   SmallVector<char, 128> Out;
+  SmallVector<dependency_directives_scan::Token, 4> Tokens;
   SmallVector<Directive, 4> Directives;
 
-  ASSERT_FALSE(minimizeSourceToDependencyDirectives("", Out, Directives));
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("", Out, Tokens, Directives));
   EXPECT_TRUE(Out.empty());
+  EXPECT_TRUE(Tokens.empty());
   ASSERT_EQ(1u, Directives.size());
   ASSERT_EQ(pp_eof, Directives.back().Kind);
 
-  ASSERT_FALSE(
-      minimizeSourceToDependencyDirectives("abc def\nxyz", Out, Directives));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("abc def\nxyz", Out, Tokens,
+                                                    Directives));
   EXPECT_TRUE(Out.empty());
+  EXPECT_TRUE(Tokens.empty());
   ASSERT_EQ(1u, Directives.size());
   ASSERT_EQ(pp_eof, Directives.back().Kind);
 }
 
-TEST(MinimizeSourceToDependencyDirectivesTest, AllDirectives) {
+TEST(MinimizeSourceToDependencyDirectivesTest, AllTokens) {
   SmallVector<char, 128> Out;
+  SmallVector<dependency_directives_scan::Token, 4> Tokens;
   SmallVector<Directive, 4> Directives;
 
   ASSERT_FALSE(
@@ -71,7 +90,7 @@
                                            "#pragma include_alias(<A>, <B>)\n"
                                            "export module m;\n"
                                            "import m;\n",
-                                           Out, Directives));
+                                           Out, Tokens, Directives));
   EXPECT_EQ(pp_define, Directives[0].Kind);
   EXPECT_EQ(pp_undef, Directives[1].Kind);
   EXPECT_EQ(pp_endif, Directives[2].Kind);
@@ -91,19 +110,20 @@
   EXPECT_EQ(pp_pragma_push_macro, Directives[16].Kind);
   EXPECT_EQ(pp_pragma_pop_macro, Directives[17].Kind);
   EXPECT_EQ(pp_pragma_include_alias, Directives[18].Kind);
-  EXPECT_EQ(cxx_export_decl, Directives[19].Kind);
-  EXPECT_EQ(cxx_module_decl, Directives[20].Kind);
-  EXPECT_EQ(cxx_import_decl, Directives[21].Kind);
-  EXPECT_EQ(pp_eof, Directives[22].Kind);
+  EXPECT_EQ(cxx_export_module_decl, Directives[19].Kind);
+  EXPECT_EQ(cxx_import_decl, Directives[20].Kind);
+  EXPECT_EQ(pp_eof, Directives[21].Kind);
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, Define) {
   SmallVector<char, 128> Out;
+  SmallVector<dependency_directives_scan::Token, 4> Tokens;
   SmallVector<Directive, 4> Directives;
 
-  ASSERT_FALSE(
-      minimizeSourceToDependencyDirectives("#define MACRO", Out, Directives));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO", Out,
+                                                    Tokens, Directives));
   EXPECT_STREQ("#define MACRO\n", Out.data());
+  ASSERT_EQ(4u, Tokens.size());
   ASSERT_EQ(2u, Directives.size());
   ASSERT_EQ(pp_define, Directives.front().Kind);
 }
@@ -144,25 +164,25 @@
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO   con  tent   ", Out));
-  EXPECT_STREQ("#define MACRO con  tent\n", Out.data());
+  EXPECT_STREQ("#define MACRO con tent\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO()   con  tent   ", Out));
-  EXPECT_STREQ("#define MACRO() con  tent\n", Out.data());
+  EXPECT_STREQ("#define MACRO() con tent\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineInvalidMacroArguments) {
   SmallVector<char, 128> Out;
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO((a))", Out));
-  EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());
+  EXPECT_STREQ("#define MACRO((a))\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO(", Out));
-  EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());
+  EXPECT_STREQ("#define MACRO(\n", Out.data());
 
   ASSERT_FALSE(
       minimizeSourceToDependencyDirectives("#define MACRO(a * b)", Out));
-  EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());
+  EXPECT_STREQ("#define MACRO(a*b)\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineHorizontalWhitespace) {
@@ -170,19 +190,19 @@
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO(\t)\tcon \t tent\t", Out));
-  EXPECT_STREQ("#define MACRO() con \t tent\n", Out.data());
+  EXPECT_STREQ("#define MACRO() con tent\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO(\f)\fcon \f tent\f", Out));
-  EXPECT_STREQ("#define MACRO() con \f tent\n", Out.data());
+  EXPECT_STREQ("#define MACRO() con tent\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO(\v)\vcon \v tent\v", Out));
-  EXPECT_STREQ("#define MACRO() con \v tent\n", Out.data());
+  EXPECT_STREQ("#define MACRO() con tent\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO \t\v\f\v\t con\f\t\vtent\v\f \v", Out));
-  EXPECT_STREQ("#define MACRO con\f\t\vtent\n", Out.data());
+  EXPECT_STREQ("#define MACRO con tent\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineMultilineArgs) {
@@ -255,25 +275,27 @@
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineNumber) {
   SmallVector<char, 128> Out;
 
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("#define 0\n", Out));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define 0\n", Out));
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoName) {
   SmallVector<char, 128> Out;
 
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("#define &\n", Out));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define &\n", Out));
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoWhitespace) {
   SmallVector<char, 128> Out;
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND&\n", Out));
-  EXPECT_STREQ("#define AND &\n", Out.data());
+  EXPECT_STREQ("#define AND&\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND\\\n"
                                                     "&\n",
                                                     Out));
-  EXPECT_STREQ("#define AND &\n", Out.data());
+  EXPECT_STREQ("#define AND\\\n"
+               "&\n",
+               Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, MultilineComment) {
@@ -481,6 +503,9 @@
   ASSERT_FALSE(
       minimizeSourceToDependencyDirectives("#__include_macros <A>\n", Out));
   EXPECT_STREQ("#__include_macros <A>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#include MACRO\n", Out));
+  EXPECT_STREQ("#include MACRO\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, AtImport) {
@@ -507,8 +532,9 @@
   SmallVector<char, 128> Out;
 
   ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import A\n", Out));
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import MACRO(A);\n", Out));
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import \" \";\n", Out));
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("@import MACRO(A);\n", Out));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("@import \" \";\n", Out));
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, RawStringLiteral) {
@@ -559,7 +585,8 @@
                                                     "#define GUARD\n"
                                                     "#endif\n",
                                                     Out));
-  EXPECT_STREQ("#ifndef GUARD\n"
+  EXPECT_STREQ("#if\\\n"
+               "ndef GUARD\n"
                "#define GUARD\n"
                "#endif\n",
                Out.data());
@@ -567,12 +594,16 @@
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\n"
                                                     "RD\n",
                                                     Out));
-  EXPECT_STREQ("#define GUARD\n", Out.data());
+  EXPECT_STREQ("#define GUA\\\n"
+               "RD\n",
+               Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\r"
                                                     "RD\n",
                                                     Out));
-  EXPECT_STREQ("#define GUARD\n", Out.data());
+  EXPECT_STREQ("#define GUA\\\r"
+               "RD\n",
+               Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\n"
                                                     "           RD\n",
@@ -588,7 +619,10 @@
                                                     "2 + \\\t\n"
                                                     "3\n",
                                                     Out));
-  EXPECT_STREQ("#define A 1 + 2 + 3\n", Out.data());
+  EXPECT_STREQ("#define A 1+\\  \n"
+               "2+\\\t\n"
+               "3\n",
+               Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, PoundWarningAndError) {
@@ -682,6 +716,7 @@
 
 TEST(MinimizeSourceToDependencyDirectivesTest, PragmaOnce) {
   SmallVector<char, 128> Out;
+  SmallVector<dependency_directives_scan::Token, 4> Tokens;
   SmallVector<Directive, 4> Directives;
 
   StringRef Source = R"(// comment
@@ -689,7 +724,8 @@
 // another comment
 #include <test.h>
 )";
-  ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out, Directives));
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives));
   EXPECT_STREQ("#pragma once\n#include <test.h>\n", Out.data());
   ASSERT_EQ(Directives.size(), 3u);
   EXPECT_EQ(Directives[0].Kind, dependency_directives_scan::pp_pragma_once);
@@ -700,7 +736,7 @@
     #include <test.h>
     )";
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out));
-  EXPECT_STREQ("#pragma once\n#include <test.h>\n", Out.data());
+  EXPECT_STREQ("#pragma once extra tokens\n#include <test.h>\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest,
@@ -755,11 +791,12 @@
 
   Source = "#define X \"\\ \r\nx\n#include <x>\n";
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out));
-  EXPECT_STREQ("#define X \"\\ \r\nx\n#include <x>\n", Out.data());
+  EXPECT_STREQ("#define X\"\\ \r\nx\n#include <x>\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, CxxModules) {
   SmallVector<char, 128> Out;
+  SmallVector<dependency_directives_scan::Token, 4> Tokens;
   SmallVector<Directive, 4> Directives;
 
   StringRef Source = R"(
@@ -789,16 +826,17 @@
       import f(->a = 3);
     }
     )";
-  ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out, Directives));
-  EXPECT_STREQ("#include \"textual-header.h\"\nexport module m;\n"
-               "export import :l [[rename]];\n"
-               "import <<= 3;\nimport a b d e d e f e;\n"
-               "import foo [[no_unique_address]];\nimport foo();\n"
-               "import f(:sefse);\nimport f(->a = 3);\n",
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives));
+  EXPECT_STREQ("#include \"textual-header.h\"\nexport module m;"
+               "exp\\\nort import:l[[rename]];"
+               "import<<=3;import a b d e d e f e;"
+               "import foo[[no_unique_address]];import foo();"
+               "import f(:sefse);import f(->a=3);\n",
                Out.data());
-  ASSERT_EQ(Directives.size(), 12u);
-  EXPECT_EQ(Directives[0].Kind, dependency_directives_scan::pp_include);
-  EXPECT_EQ(Directives[2].Kind, dependency_directives_scan::cxx_module_decl);
+  ASSERT_EQ(Directives.size(), 10u);
+  EXPECT_EQ(Directives[0].Kind, pp_include);
+  EXPECT_EQ(Directives[1].Kind, cxx_export_module_decl);
 }
 
 } // end anonymous namespace
Index: clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c
===================================================================
--- clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c
+++ clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c
@@ -15,7 +15,7 @@
 #pragma include_alias(<string>,   "mystring.h")
 
 // CHECK:      #pragma once
-// CHECK-NEXT: #pragma push_macro( "MYMACRO" )
+// CHECK-NEXT: #pragma push_macro("MYMACRO")
 // CHECK-NEXT: #pragma pop_macro("MYMACRO")
 // CHECK-NEXT: #pragma clang module import mymodule
 // CHECK-NEXT: #pragma include_alias(<string>, "mystring.h")
Index: clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c
===================================================================
--- clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c
+++ clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c
@@ -1,3 +1,4 @@
-// RUN: %clang_cc1 -verify -print-dependency-directives-minimized-source %s 2>&1
+// RUN: %clang_cc1 -print-dependency-directives-minimized-source %s 2>&1 | FileCheck %s
 
-#define 0 0 // expected-error {{macro name must be an identifier}}
+#define 0 0
+// CHECK: #define 0 0
Index: clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
===================================================================
--- clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
+++ clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h"
-#include "clang/Lex/DependencyDirectivesScanner.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SmallVectorMemoryBuffer.h"
 #include "llvm/Support/Threading.h"
@@ -45,47 +44,40 @@
     const CachedFileSystemEntry &Entry, StringRef Filename, bool Disable) {
   if (Entry.isError() || Entry.isDirectory() || Disable ||
       !shouldScanForDirectives(Filename, Entry.getUniqueID()))
-    return EntryRef(/*Minimized=*/false, Filename, Entry);
+    return EntryRef(Filename, Entry);
 
   CachedFileContents *Contents = Entry.getCachedContents();
   assert(Contents && "contents not initialized");
 
   // Double-checked locking.
-  if (Contents->MinimizedAccess.load())
-    return EntryRef(/*Minimized=*/true, Filename, Entry);
+  if (Contents->DepDirectives.load())
+    return EntryRef(Filename, Entry);
 
   std::lock_guard<std::mutex> GuardLock(Contents->ValueLock);
 
   // Double-checked locking.
-  if (Contents->MinimizedAccess.load())
-    return EntryRef(/*Minimized=*/true, Filename, Entry);
+  if (Contents->DepDirectives.load())
+    return EntryRef(Filename, Entry);
 
-  llvm::SmallString<1024> MinimizedFileContents;
-  // Minimize the file down to directives that might affect the dependencies.
-  SmallVector<dependency_directives_scan::Directive, 64> Tokens;
+  SmallVector<dependency_directives_scan::Directive, 64> Directives;
+  // Scan the file for preprocessor directives that might affect the
+  // dependencies.
   if (scanSourceForDependencyDirectives(Contents->Original->getBuffer(),
-                                        MinimizedFileContents, Tokens)) {
+                                        Contents->DepDirectiveTokens,
+                                        Directives)) {
+    Contents->DepDirectiveTokens.clear();
     // FIXME: Propagate the diagnostic if desired by the client.
-    // Use the original file if the minimization failed.
-    Contents->MinimizedStorage =
-        llvm::MemoryBuffer::getMemBuffer(*Contents->Original);
-    Contents->MinimizedAccess.store(Contents->MinimizedStorage.get());
-    return EntryRef(/*Minimized=*/true, Filename, Entry);
+    Contents->DepDirectives.store(new Optional<DependencyDirectivesTy>());
+    return EntryRef(Filename, Entry);
   }
 
-  // The contents produced by the minimizer must be null terminated.
-  assert(MinimizedFileContents.data()[MinimizedFileContents.size()] == '\0' &&
-         "not null terminated contents");
-
-  Contents->MinimizedStorage = std::make_unique<llvm::SmallVectorMemoryBuffer>(
-      std::move(MinimizedFileContents));
-  // This function performed double-checked locking using `MinimizedAccess`.
-  // Assigning it must be the last thing this function does. If we were to
-  // assign it before `PPSkippedRangeMapping`, other threads may skip the
-  // critical section (`MinimizedAccess != nullptr`) and access the mappings
-  // that are about to be initialized, leading to a data race.
-  Contents->MinimizedAccess.store(Contents->MinimizedStorage.get());
-  return EntryRef(/*Minimized=*/true, Filename, Entry);
+  // This function performed double-checked locking using `DepDirectives`.
+  // Assigning it must be the last thing this function does, otherwise other
+  // threads may skip the
+  // critical section (`DepDirectives != nullptr`), leading to a data race.
+  Contents->DepDirectives.store(
+      new Optional<DependencyDirectivesTy>(std::move(Directives)));
+  return EntryRef(Filename, Entry);
 }
 
 DependencyScanningFilesystemSharedCache::
Index: clang/lib/Lex/Lexer.cpp
===================================================================
--- clang/lib/Lex/Lexer.cpp
+++ clang/lib/Lex/Lexer.cpp
@@ -226,13 +226,11 @@
   return L;
 }
 
-bool Lexer::skipOver(unsigned NumBytes) {
-  IsAtPhysicalStartOfLine = true;
-  IsAtStartOfLine = true;
-  if ((BufferPtr + NumBytes) > BufferEnd)
-    return true;
-  BufferPtr += NumBytes;
-  return false;
+void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
+  this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
+  this->IsAtStartOfLine = IsAtStartOfLine;
+  assert((BufferStart + Offset) <= BufferEnd);
+  BufferPtr = BufferStart + Offset;
 }
 
 template <typename T> static void StringifyImpl(T &Str, char Quote) {
Index: clang/lib/Lex/DependencyDirectivesScanner.cpp
===================================================================
--- clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -18,92 +18,136 @@
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Lex/LexDiagnostic.h"
+#include "clang/Lex/Lexer.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/MemoryBuffer.h"
 
-using namespace llvm;
 using namespace clang;
 using namespace clang::dependency_directives_scan;
+using namespace llvm;
 
 namespace {
 
-struct Scanner {
-  /// Minimized output.
-  SmallVectorImpl<char> &Out;
-  /// The known tokens encountered during the minimization.
-  SmallVectorImpl<Directive> &Directives;
+struct DirectiveWithTokens {
+  DirectiveKind Kind;
+  unsigned NumTokens;
+
+  DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens)
+      : Kind(Kind), NumTokens(NumTokens) {}
+};
 
-  Scanner(SmallVectorImpl<char> &Out, SmallVectorImpl<Directive> &Directives,
-          StringRef Input, DiagnosticsEngine *Diags,
-          SourceLocation InputSourceLoc)
-      : Out(Out), Directives(Directives), Input(Input), Diags(Diags),
-        InputSourceLoc(InputSourceLoc) {}
+/// Does an efficient "scan" of the sources to detect the presence of
+/// preprocessor (or module import) directives and collects the raw lexed tokens
+/// for those directives so that the \p Lexer can "replay" them when the file is
+/// included.
+///
+/// Note that the behavior of the raw lexer is affected by the language mode,
+/// while at this point we want to do a scan and collect tokens once,
+/// irrespective of the language mode that the file will get included in. To
+/// compensate for that the \p Lexer, while "replaying", will adjust a token
+/// where appropriate, when it could affect the preprocessor's state.
+/// For example in a directive like
+///
+/// \code
+///   #if __has_cpp_attribute(clang::fallthrough)
+/// \endcode
+///
+/// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2
+/// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon'
+/// while in C++ mode.
+struct Scanner {
+  Scanner(StringRef Input,
+          SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
+          DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)
+      : Input(Input), Tokens(Tokens), Diags(Diags),
+        InputSourceLoc(InputSourceLoc),
+        TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
+                 Input.end()) {
+    // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
+    LangOpts.ObjC = true;
+  }
 
   /// Lex the provided source and emit the directive tokens.
   ///
   /// \returns True on error.
-  bool scan();
+  bool scan(SmallVectorImpl<Directive> &Directives);
 
 private:
-  struct IdInfo {
-    const char *Last;
-    StringRef Name;
-  };
+  /// Lexes next token and advances \p First and the \p Lexer.
+  LLVM_NODISCARD dependency_directives_scan::Token &
+  lexToken(const char *&First, const char *const End);
+
+  dependency_directives_scan::Token &lexIncludeFilename(const char *&First,
+                                                        const char *const End);
+
+  /// Lexes next token and if it is identifier returns its string, otherwise
+  /// returns \p None.
+  ///
+  /// In any case (whatever the token kind) \p First and the \p Lexer will
+  /// advance beyond the token.
+  LLVM_NODISCARD Optional<StringRef> tryLexIdentifier(const char *&First,
+                                                      const char *const End);
+
+  /// Used when it is certain that next token is an identifier.
+  LLVM_NODISCARD StringRef lexIdentifier(const char *&First,
+                                         const char *const End);
 
-  /// Lex an identifier.
+  /// Lexes next token and returns true iff it is an identifier that matches \p
+  /// Id.
   ///
-  /// \pre First points at a valid identifier head.
-  LLVM_NODISCARD IdInfo lexIdentifier(const char *First, const char *const End);
+  /// In any case (whatever the token kind) \p First and the \p Lexer will
+  /// advance beyond the token.
   LLVM_NODISCARD bool isNextIdentifier(StringRef Id, const char *&First,
                                        const char *const End);
+
   LLVM_NODISCARD bool scanImpl(const char *First, const char *const End);
   LLVM_NODISCARD bool lexPPLine(const char *&First, const char *const End);
   LLVM_NODISCARD bool lexAt(const char *&First, const char *const End);
   LLVM_NODISCARD bool lexModule(const char *&First, const char *const End);
-  LLVM_NODISCARD bool lexDefine(const char *&First, const char *const End);
+  LLVM_NODISCARD bool lexDefine(const char *HashLoc, const char *&First,
+                                const char *const End);
   LLVM_NODISCARD bool lexPragma(const char *&First, const char *const End);
   LLVM_NODISCARD bool lexEndif(const char *&First, const char *const End);
-  LLVM_NODISCARD bool lexDefault(DirectiveKind Kind, StringRef Directive,
-                                 const char *&First, const char *const End);
-  Directive &pushDirective(DirectiveKind K) {
-    Directives.emplace_back(K, Out.size());
-    return Directives.back();
+  LLVM_NODISCARD bool lexDefault(DirectiveKind Kind, const char *&First,
+                                 const char *const End);
+  LLVM_NODISCARD bool lexModuleDirectiveBody(DirectiveKind Kind,
+                                             const char *&First,
+                                             const char *const End);
+  void lexPPDirectiveBody(const char *&First, const char *const End);
+
+  DirectiveWithTokens &pushDirective(DirectiveKind Kind) {
+    Tokens.append(CurDirToks);
+    DirsWithToks.emplace_back(Kind, CurDirToks.size());
+    CurDirToks.clear();
+    return DirsWithToks.back();
   }
   void popDirective() {
-    Out.resize(Directives.back().Offset);
-    Directives.pop_back();
+    Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);
   }
   DirectiveKind topDirective() const {
-    return Directives.empty() ? pp_none : Directives.back().Kind;
+    return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind;
   }
 
-  Scanner &put(char Byte) {
-    Out.push_back(Byte);
-    return *this;
-  }
-  Scanner &append(StringRef S) { return append(S.begin(), S.end()); }
-  Scanner &append(const char *First, const char *Last) {
-    Out.append(First, Last);
-    return *this;
+  unsigned getOffsetAt(const char *CurPtr) const {
+    return CurPtr - Input.data();
   }
 
-  void printToNewline(const char *&First, const char *const End);
-  void printAdjacentModuleNameParts(const char *&First, const char *const End);
-  LLVM_NODISCARD bool printAtImportBody(const char *&First,
-                                        const char *const End);
-  void printDirectiveBody(const char *&First, const char *const End);
-  void printAdjacentMacroArgs(const char *&First, const char *const End);
-  LLVM_NODISCARD bool printMacroArgs(const char *&First, const char *const End);
-
   /// Reports a diagnostic if the diagnostic engine is provided. Always returns
   /// true at the end.
   bool reportError(const char *CurPtr, unsigned Err);
 
   StringMap<char> SplitIds;
   StringRef Input;
+  SmallVectorImpl<dependency_directives_scan::Token> &Tokens;
   DiagnosticsEngine *Diags;
   SourceLocation InputSourceLoc;
+
+  SmallVector<dependency_directives_scan::Token, 32> CurDirToks;
+  SmallVector<DirectiveWithTokens, 64> DirsWithToks;
+  LangOptions LangOpts;
+  Lexer TheLexer;
 };
 
 } // end anonymous namespace
@@ -112,7 +156,7 @@
   if (!Diags)
     return true;
   assert(CurPtr >= Input.data() && "invalid buffer ptr");
-  Diags->Report(InputSourceLoc.getLocWithOffset(CurPtr - Input.data()), Err);
+  Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err);
   return true;
 }
 
@@ -265,30 +309,6 @@
   }
 }
 
-static const char *findLastNonSpace(const char *First, const char *Last) {
-  assert(First <= Last);
-  while (First != Last && isHorizontalWhitespace(Last[-1]))
-    --Last;
-  return Last;
-}
-
-static const char *findLastNonSpaceNonBackslash(const char *First,
-                                                const char *Last) {
-  assert(First <= Last);
-  while (First != Last &&
-         (isHorizontalWhitespace(Last[-1]) || Last[-1] == '\\'))
-    --Last;
-  return Last;
-}
-
-static const char *findFirstTrailingSpace(const char *First, const char *Last) {
-  const char *LastNonSpace = findLastNonSpace(First, Last);
-  if (Last == LastNonSpace)
-    return Last;
-  assert(isHorizontalWhitespace(LastNonSpace[0]));
-  return LastNonSpace + 1;
-}
-
 static void skipLineComment(const char *&First, const char *const End) {
   assert(First[0] == '/' && First[1] == '/');
   First += 2;
@@ -396,67 +416,6 @@
     skipLine(First, End);
 }
 
-void Scanner::printToNewline(const char *&First, const char *const End) {
-  while (First != End && !isVerticalWhitespace(*First)) {
-    const char *Last = First;
-    do {
-      // Iterate over strings correctly to avoid comments and newlines.
-      if (*Last == '"' || *Last == '\'' ||
-          (*Last == '<' &&
-           (topDirective() == pp_include || topDirective() == pp_import))) {
-        if (LLVM_UNLIKELY(isRawStringLiteral(First, Last)))
-          skipRawString(Last, End);
-        else
-          skipString(Last, End);
-        continue;
-      }
-      if (*Last != '/' || End - Last < 2) {
-        ++Last;
-        continue; // Gather the rest up to print verbatim.
-      }
-
-      if (Last[1] != '/' && Last[1] != '*') {
-        ++Last;
-        continue;
-      }
-
-      // Deal with "//..." and "/*...*/".
-      append(First, findFirstTrailingSpace(First, Last));
-      First = Last;
-
-      if (Last[1] == '/') {
-        skipLineComment(First, End);
-        return;
-      }
-
-      put(' ');
-      skipBlockComment(First, End);
-      skipOverSpaces(First, End);
-      Last = First;
-    } while (Last != End && !isVerticalWhitespace(*Last));
-
-    // Print out the string.
-    const char *LastBeforeTrailingSpace = findLastNonSpace(First, Last);
-    if (Last == End || LastBeforeTrailingSpace == First ||
-        LastBeforeTrailingSpace[-1] != '\\') {
-      append(First, LastBeforeTrailingSpace);
-      First = Last;
-      skipNewline(First, End);
-      return;
-    }
-
-    // Print up to the last character that's not a whitespace or backslash.
-    // Then print exactly one space, which matters when tokens are separated by
-    // a line continuation.
-    append(First, findLastNonSpaceNonBackslash(First, Last));
-    put(' ');
-
-    First = Last;
-    skipNewline(First, End);
-    skipOverSpaces(First, End);
-  }
-}
-
 static void skipWhitespace(const char *&First, const char *const End) {
   for (;;) {
     assert(First <= End);
@@ -489,176 +448,133 @@
   }
 }
 
-void Scanner::printAdjacentModuleNameParts(const char *&First,
-                                           const char *const End) {
-  // Skip over parts of the body.
-  const char *Last = First;
-  do
-    ++Last;
-  while (Last != End && (isAsciiIdentifierContinue(*Last) || *Last == '.'));
-  append(First, Last);
-  First = Last;
-}
-
-bool Scanner::printAtImportBody(const char *&First, const char *const End) {
+bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,
+                                     const char *const End) {
+  const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;
   for (;;) {
-    skipWhitespace(First, End);
-    if (First == End)
-      return true;
-
-    if (isVerticalWhitespace(*First)) {
-      skipNewline(First, End);
-      continue;
-    }
-
-    // Found a semicolon.
-    if (*First == ';') {
-      put(*First++).put('\n');
-      return false;
-    }
-
-    // Don't handle macro expansions inside @import for now.
-    if (!isAsciiIdentifierContinue(*First) && *First != '.')
-      return true;
-
-    printAdjacentModuleNameParts(First, End);
+    const dependency_directives_scan::Token &Tok = lexToken(First, End);
+    if (Tok.is(tok::eof))
+      return reportError(
+          DirectiveLoc,
+          diag::err_dep_source_scanner_missing_semi_after_at_import);
+    if (Tok.is(tok::semi))
+      break;
   }
+  pushDirective(Kind);
+  skipWhitespace(First, End);
+  if (First == End)
+    return false;
+  if (!isVerticalWhitespace(*First))
+    return reportError(
+        DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import);
+  skipNewline(First, End);
+  return false;
 }
 
-void Scanner::printDirectiveBody(const char *&First, const char *const End) {
-  skipWhitespace(First, End); // Skip initial whitespace.
-  printToNewline(First, End);
-  while (Out.back() == ' ')
-    Out.pop_back();
-  put('\n');
-}
+dependency_directives_scan::Token &Scanner::lexToken(const char *&First,
+                                                     const char *const End) {
+  clang::Token Tok;
+  TheLexer.LexFromRawLexer(Tok);
+  First = Input.data() + TheLexer.getCurrentBufferOffset();
+  assert(First <= End);
 
-LLVM_NODISCARD static const char *lexRawIdentifier(const char *First,
-                                                   const char *const End) {
-  assert(isAsciiIdentifierContinue(*First) && "invalid identifer");
-  const char *Last = First + 1;
-  while (Last != End && isAsciiIdentifierContinue(*Last))
-    ++Last;
-  return Last;
+  unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
+  CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
+                          Tok.getFlags());
+  return CurDirToks.back();
 }
 
-LLVM_NODISCARD static const char *
-getIdentifierContinuation(const char *First, const char *const End) {
-  if (End - First < 3 || First[0] != '\\' || !isVerticalWhitespace(First[1]))
-    return nullptr;
+dependency_directives_scan::Token &
+Scanner::lexIncludeFilename(const char *&First, const char *const End) {
+  clang::Token Tok;
+  TheLexer.LexIncludeFilename(Tok);
+  First = Input.data() + TheLexer.getCurrentBufferOffset();
+  assert(First <= End);
 
-  ++First;
-  skipNewline(First, End);
-  if (First == End)
-    return nullptr;
-  return isAsciiIdentifierContinue(First[0]) ? First : nullptr;
+  unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
+  CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
+                          Tok.getFlags());
+  return CurDirToks.back();
 }
 
-Scanner::IdInfo Scanner::lexIdentifier(const char *First,
-                                       const char *const End) {
-  const char *Last = lexRawIdentifier(First, End);
-  const char *Next = getIdentifierContinuation(Last, End);
-  if (LLVM_LIKELY(!Next))
-    return IdInfo{Last, StringRef(First, Last - First)};
-
-  // Slow path, where identifiers are split over lines.
-  SmallVector<char, 64> Id(First, Last);
-  while (Next) {
-    Last = lexRawIdentifier(Next, End);
-    Id.append(Next, Last);
-    Next = getIdentifierContinuation(Last, End);
+void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
+  while (true) {
+    const dependency_directives_scan::Token &Tok = lexToken(First, End);
+    if (Tok.is(tok::eod))
+      break;
   }
-  return IdInfo{
-      Last,
-      SplitIds.try_emplace(StringRef(Id.begin(), Id.size()), 0).first->first()};
-}
-
-void Scanner::printAdjacentMacroArgs(const char *&First,
-                                     const char *const End) {
-  // Skip over parts of the body.
-  const char *Last = First;
-  do
-    ++Last;
-  while (Last != End &&
-         (isAsciiIdentifierContinue(*Last) || *Last == '.' || *Last == ','));
-  append(First, Last);
-  First = Last;
 }
 
-bool Scanner::printMacroArgs(const char *&First, const char *const End) {
-  assert(*First == '(');
-  put(*First++);
-  for (;;) {
-    skipWhitespace(First, End);
-    if (First == End)
-      return true;
+LLVM_NODISCARD Optional<StringRef>
+Scanner::tryLexIdentifier(const char *&First, const char *const End) {
+  const dependency_directives_scan::Token &Tok = lexToken(First, End);
+  if (Tok.isNot(tok::raw_identifier))
+    return None;
 
-    if (*First == ')') {
-      put(*First++);
-      return false;
-    }
+  bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
+  if (LLVM_LIKELY(!NeedsCleaning))
+    return Input.slice(Tok.Offset, Tok.getEnd());
 
-    // This is intentionally fairly liberal.
-    if (!(isAsciiIdentifierContinue(*First) || *First == '.' || *First == ','))
-      return true;
+  SmallString<64> Spelling;
+  Spelling.resize(Tok.Length);
 
-    printAdjacentMacroArgs(First, End);
+  unsigned SpellingLength = 0;
+  const char *BufPtr = Input.begin() + Tok.Offset;
+  const char *AfterIdent = Input.begin() + Tok.getEnd();
+  while (BufPtr < AfterIdent) {
+    unsigned Size;
+    Spelling[SpellingLength++] =
+        Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
+    BufPtr += Size;
   }
+
+  return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
+      .first->first();
+}
+
+StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
+  Optional<StringRef> Id = tryLexIdentifier(First, End);
+  assert(Id.hasValue() && "expected identifier token");
+  return Id.getValue();
 }
 
-/// Looks for an identifier starting from Last.
-///
-/// Updates "First" to just past the next identifier, if any.  Returns true iff
-/// the identifier matches "Id".
 bool Scanner::isNextIdentifier(StringRef Id, const char *&First,
                                const char *const End) {
-  skipWhitespace(First, End);
-  if (First == End || !isAsciiIdentifierStart(*First))
-    return false;
-
-  IdInfo FoundId = lexIdentifier(First, End);
-  First = FoundId.Last;
-  return FoundId.Name == Id;
+  if (Optional<StringRef> FoundId = tryLexIdentifier(First, End)) {
+    return *FoundId == Id;
+  }
+  return false;
 }
 
 bool Scanner::lexAt(const char *&First, const char *const End) {
   // Handle "@import".
-  const char *ImportLoc = First++;
+
+  // Lex '@'.
+  const dependency_directives_scan::Token &AtTok = lexToken(First, End);
+  assert(AtTok.is(tok::at));
+  (void)AtTok;
+
   if (!isNextIdentifier("import", First, End)) {
     skipLine(First, End);
     return false;
   }
-  pushDirective(decl_at_import);
-  append("@import ");
-  if (printAtImportBody(First, End))
-    return reportError(
-        ImportLoc, diag::err_dep_source_scanner_missing_semi_after_at_import);
-  skipWhitespace(First, End);
-  if (First == End)
-    return false;
-  if (!isVerticalWhitespace(*First))
-    return reportError(
-        ImportLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import);
-  skipNewline(First, End);
-  return false;
+  return lexModuleDirectiveBody(decl_at_import, First, End);
 }
 
 bool Scanner::lexModule(const char *&First, const char *const End) {
-  IdInfo Id = lexIdentifier(First, End);
-  First = Id.Last;
+  StringRef Id = lexIdentifier(First, End);
   bool Export = false;
-  if (Id.Name == "export") {
+  if (Id == "export") {
     Export = true;
-    skipWhitespace(First, End);
-    if (!isAsciiIdentifierContinue(*First)) {
+    Optional<StringRef> NextId = tryLexIdentifier(First, End);
+    if (!NextId) {
       skipLine(First, End);
       return false;
     }
-    Id = lexIdentifier(First, End);
-    First = Id.Last;
+    Id = *NextId;
   }
 
-  if (Id.Name != "module" && Id.Name != "import") {
+  if (Id != "module" && Id != "import") {
     skipLine(First, End);
     return false;
   }
@@ -680,94 +596,36 @@
     }
   }
 
-  if (Export) {
-    pushDirective(cxx_export_decl);
-    append("export ");
-  }
+  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false);
 
-  if (Id.Name == "module")
-    pushDirective(cxx_module_decl);
+  DirectiveKind Kind;
+  if (Id == "module")
+    Kind = Export ? cxx_export_module_decl : cxx_module_decl;
   else
-    pushDirective(cxx_import_decl);
-  append(Id.Name);
-  append(" ");
-  printToNewline(First, End);
-  append("\n");
-  return false;
-}
-
-bool Scanner::lexDefine(const char *&First, const char *const End) {
-  pushDirective(pp_define);
-  append("#define ");
-  skipWhitespace(First, End);
+    Kind = Export ? cxx_export_import_decl : cxx_import_decl;
 
-  if (!isAsciiIdentifierStart(*First))
-    return reportError(First, diag::err_pp_macro_not_identifier);
-
-  IdInfo Id = lexIdentifier(First, End);
-  const char *Last = Id.Last;
-  append(Id.Name);
-  if (Last == End)
-    return false;
-  if (*Last == '(') {
-    size_t Size = Out.size();
-    if (printMacroArgs(Last, End)) {
-      // Be robust to bad macro arguments, since they can show up in disabled
-      // code.
-      Out.resize(Size);
-      append("(/* invalid */\n");
-      skipLine(Last, End);
-      return false;
-    }
-  }
-  skipWhitespace(Last, End);
-  if (Last == End)
-    return false;
-  if (!isVerticalWhitespace(*Last))
-    put(' ');
-  printDirectiveBody(Last, End);
-  First = Last;
-  return false;
+  return lexModuleDirectiveBody(Kind, First, End);
 }
 
 bool Scanner::lexPragma(const char *&First, const char *const End) {
-  // #pragma.
-  skipWhitespace(First, End);
-  if (First == End || !isAsciiIdentifierStart(*First))
+  Optional<StringRef> FoundId = tryLexIdentifier(First, End);
+  if (!FoundId)
     return false;
 
-  IdInfo FoundId = lexIdentifier(First, End);
-  First = FoundId.Last;
-  if (FoundId.Name == "once") {
-    // #pragma once
-    skipLine(First, End);
-    pushDirective(pp_pragma_once);
-    append("#pragma once\n");
-    return false;
-  }
-  if (FoundId.Name == "push_macro") {
-    // #pragma push_macro
-    pushDirective(pp_pragma_push_macro);
-    append("#pragma push_macro");
-    printDirectiveBody(First, End);
-    return false;
-  }
-  if (FoundId.Name == "pop_macro") {
-    // #pragma pop_macro
-    pushDirective(pp_pragma_pop_macro);
-    append("#pragma pop_macro");
-    printDirectiveBody(First, End);
-    return false;
-  }
-  if (FoundId.Name == "include_alias") {
-    // #pragma include_alias
-    pushDirective(pp_pragma_include_alias);
-    append("#pragma include_alias");
-    printDirectiveBody(First, End);
+  StringRef Id = FoundId.getValue();
+  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
+                  .Case("once", pp_pragma_once)
+                  .Case("push_macro", pp_pragma_push_macro)
+                  .Case("pop_macro", pp_pragma_pop_macro)
+                  .Case("include_alias", pp_pragma_include_alias)
+                  .Default(pp_none);
+  if (Kind != pp_none) {
+    lexPPDirectiveBody(First, End);
+    pushDirective(Kind);
     return false;
   }
 
-  if (FoundId.Name != "clang") {
+  if (Id != "clang") {
     skipLine(First, End);
     return false;
   }
@@ -785,9 +643,8 @@
   }
 
   // #pragma clang module import.
+  lexPPDirectiveBody(First, End);
   pushDirective(pp_pragma_import);
-  append("#pragma clang module import ");
-  printDirectiveBody(First, End);
   return false;
 }
 
@@ -808,14 +665,13 @@
     return false;
   }
 
-  return lexDefault(pp_endif, "endif", First, End);
+  return lexDefault(pp_endif, First, End);
 }
 
-bool Scanner::lexDefault(DirectiveKind Kind, StringRef Directive,
-                         const char *&First, const char *const End) {
+bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
+                         const char *const End) {
+  lexPPDirectiveBody(First, End);
   pushDirective(Kind);
-  put('#').append(Directive).put(' ');
-  printDirectiveBody(First, End);
   return false;
 }
 
@@ -845,6 +701,14 @@
     return false;
   }
 
+  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);
+
+  auto ScEx1 = make_scope_exit([&]() {
+    /// Clear Scanner's CurDirToks before returning, in case we didn't push a
+    /// new directive.
+    CurDirToks.clear();
+  });
+
   // Handle "@import".
   if (*First == '@')
     return lexAt(First, End);
@@ -853,25 +717,29 @@
     return lexModule(First, End);
 
   // Handle preprocessing directives.
-  ++First; // Skip over '#'.
-  skipWhitespace(First, End);
 
-  if (First == End)
-    return reportError(First, diag::err_pp_expected_eol);
+  TheLexer.setParsingPreprocessorDirective(true);
+  auto ScEx2 = make_scope_exit(
+      [&]() { TheLexer.setParsingPreprocessorDirective(false); });
+
+  // Lex '#'.
+  const dependency_directives_scan::Token &HashTok = lexToken(First, End);
+  assert(HashTok.is(tok::hash));
+  (void)HashTok;
+
+  Optional<StringRef> FoundId = tryLexIdentifier(First, End);
 
-  if (!isAsciiIdentifierStart(*First)) {
+  if (!FoundId) {
     skipLine(First, End);
     return false;
   }
 
-  // Figure out the token.
-  IdInfo Id = lexIdentifier(First, End);
-  First = Id.Last;
+  StringRef Id = FoundId.getValue();
 
-  if (Id.Name == "pragma")
+  if (Id == "pragma")
     return lexPragma(First, End);
 
-  auto Kind = llvm::StringSwitch<DirectiveKind>(Id.Name)
+  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
                   .Case("include", pp_include)
                   .Case("__include_macros", pp___include_macros)
                   .Case("define", pp_define)
@@ -888,18 +756,26 @@
                   .Case("endif", pp_endif)
                   .Default(pp_none);
   if (Kind == pp_none) {
-    skipDirective(Id.Name, First, End);
+    skipDirective(Id, First, End);
     return false;
   }
 
   if (Kind == pp_endif)
     return lexEndif(First, End);
 
-  if (Kind == pp_define)
-    return lexDefine(First, End);
+  switch (Kind) {
+  case pp_include:
+  case pp___include_macros:
+  case pp_include_next:
+  case pp_import:
+    lexIncludeFilename(First, End);
+    break;
+  default:
+    break;
+  }
 
   // Everything else.
-  return lexDefault(Kind, Id.Name, First, End);
+  return lexDefault(Kind, First, End);
 }
 
 static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
@@ -916,28 +792,65 @@
   return false;
 }
 
-bool Scanner::scan() {
+bool Scanner::scan(SmallVectorImpl<Directive> &Directives) {
   bool Error = scanImpl(Input.begin(), Input.end());
 
   if (!Error) {
-    // Add a trailing newline and an EOF on success.
-    if (!Out.empty() && Out.back() != '\n')
-      Out.push_back('\n');
+    // Add an EOF on success.
     pushDirective(pp_eof);
   }
 
-  // Null-terminate the output. This way the memory buffer that's passed to
-  // Clang will not have to worry about the terminating '\0'.
-  Out.push_back(0);
-  Out.pop_back();
+  ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens;
+  for (const DirectiveWithTokens &DirWithToks : DirsWithToks) {
+    assert(RemainingTokens.size() >= DirWithToks.NumTokens);
+    Directives.emplace_back(DirWithToks.Kind,
+                            RemainingTokens.take_front(DirWithToks.NumTokens));
+    RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens);
+  }
+  assert(RemainingTokens.empty());
+
   return Error;
 }
 
 bool clang::scanSourceForDependencyDirectives(
-    StringRef Input, SmallVectorImpl<char> &Output,
+    StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
     SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags,
     SourceLocation InputSourceLoc) {
-  Output.clear();
-  Directives.clear();
-  return Scanner(Output, Directives, Input, Diags, InputSourceLoc).scan();
+  return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives);
+}
+
+void clang::printDependencyDirectivesAsSource(
+    StringRef Source,
+    ArrayRef<dependency_directives_scan::Directive> Directives,
+    llvm::raw_ostream &OS) {
+  // Add a space separator where it is convenient for testing purposes.
+  auto needsSpaceSeparator =
+      [](tok::TokenKind Prev,
+         const dependency_directives_scan::Token &Tok) -> bool {
+    if (Prev == Tok.Kind)
+      return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square,
+                          tok::r_square);
+    if (Prev == tok::raw_identifier &&
+        Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal,
+                    tok::char_constant, tok::header_name))
+      return true;
+    if (Prev == tok::r_paren &&
+        Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal,
+                    tok::char_constant, tok::unknown))
+      return true;
+    if (Prev == tok::comma &&
+        Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less))
+      return true;
+    return false;
+  };
+
+  for (const dependency_directives_scan::Directive &Directive : Directives) {
+    Optional<tok::TokenKind> PrevTokenKind;
+    for (const dependency_directives_scan::Token &Tok : Directive.Tokens) {
+      if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok))
+        OS << ' ';
+      PrevTokenKind = Tok.Kind;
+      OS << Source.slice(Tok.Offset, Tok.getEnd());
+    }
+  }
 }
Index: clang/lib/Frontend/FrontendActions.cpp
===================================================================
--- clang/lib/Frontend/FrontendActions.cpp
+++ clang/lib/Frontend/FrontendActions.cpp
@@ -1157,10 +1157,10 @@
   SourceManager &SM = CI.getPreprocessor().getSourceManager();
   llvm::MemoryBufferRef FromFile = SM.getBufferOrFake(SM.getMainFileID());
 
-  llvm::SmallString<1024> Output;
+  llvm::SmallVector<dependency_directives_scan::Token, 16> Tokens;
   llvm::SmallVector<dependency_directives_scan::Directive, 32> Directives;
   if (scanSourceForDependencyDirectives(
-          FromFile.getBuffer(), Output, Directives, &CI.getDiagnostics(),
+          FromFile.getBuffer(), Tokens, Directives, &CI.getDiagnostics(),
           SM.getLocForStartOfFile(SM.getMainFileID()))) {
     assert(CI.getDiagnostics().hasErrorOccurred() &&
            "no errors reported for failure");
@@ -1179,7 +1179,8 @@
     }
     return;
   }
-  llvm::outs() << Output;
+  printDependencyDirectivesAsSource(FromFile.getBuffer(), Directives,
+                                    llvm::outs());
 }
 
 void GetDependenciesByModuleNameAction::ExecuteAction() {
Index: clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
===================================================================
--- clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
+++ clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
@@ -10,6 +10,7 @@
 #define LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGFILESYSTEM_H
 
 #include "clang/Basic/LLVM.h"
+#include "clang/Lex/DependencyDirectivesScanner.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/Allocator.h"
@@ -21,21 +22,26 @@
 namespace tooling {
 namespace dependencies {
 
-/// Original and minimized contents of a cached file entry. Single instance can
+using DependencyDirectivesTy =
+    SmallVector<dependency_directives_scan::Directive, 20>;
+
+/// Contents and directive tokens of a cached file entry. Single instance can
 /// be shared between multiple entries.
 struct CachedFileContents {
-  CachedFileContents(std::unique_ptr<llvm::MemoryBuffer> Original)
-      : Original(std::move(Original)), MinimizedAccess(nullptr) {}
+  CachedFileContents(std::unique_ptr<llvm::MemoryBuffer> Contents)
+      : Original(std::move(Contents)), DepDirectives(nullptr) {}
 
   /// Owning storage for the original contents.
   std::unique_ptr<llvm::MemoryBuffer> Original;
 
   /// The mutex that must be locked before mutating directive tokens.
   std::mutex ValueLock;
-  /// Owning storage for the minimized contents.
-  std::unique_ptr<llvm::MemoryBuffer> MinimizedStorage;
+  SmallVector<dependency_directives_scan::Token, 10> DepDirectiveTokens;
   /// Accessor to the directive tokens that's atomic to avoid data races.
-  std::atomic<llvm::MemoryBuffer *> MinimizedAccess;
+  /// \p CachedFileContents has ownership of the pointer.
+  std::atomic<const Optional<DependencyDirectivesTy> *> DepDirectives;
+
+  ~CachedFileContents() { delete DepDirectives.load(); }
 };
 
 /// An in-memory representation of a file system entity that is of interest to
@@ -82,13 +88,17 @@
 
   /// \returns The scanned preprocessor directive tokens of the file that are
   /// used to speed up preprocessing, if available.
-  StringRef getDirectiveTokens() const {
+  Optional<ArrayRef<dependency_directives_scan::Directive>>
+  getDirectiveTokens() const {
     assert(!isError() && "error");
-    assert(!MaybeStat->isDirectory() && "not a file");
+    assert(!isDirectory() && "not a file");
     assert(Contents && "contents not initialized");
-    llvm::MemoryBuffer *Buffer = Contents->MinimizedAccess.load();
-    assert(Buffer && "not minimized");
-    return Buffer->getBuffer();
+    if (auto *Directives = Contents->DepDirectives.load()) {
+      if (Directives->hasValue())
+        return ArrayRef<dependency_directives_scan::Directive>(
+            Directives->getValue());
+    }
+    return None;
   }
 
   /// \returns The error.
@@ -224,10 +234,6 @@
 /// If the underlying entry is an opened file, this wrapper returns the file
 /// contents and the scanned preprocessor directives.
 class EntryRef {
-  /// For entry that is an opened file, this bit signifies whether its contents
-  /// are minimized.
-  bool Minimized;
-
   /// The filename used to access this entry.
   std::string Filename;
 
@@ -235,8 +241,8 @@
   const CachedFileSystemEntry &Entry;
 
 public:
-  EntryRef(bool Minimized, StringRef Name, const CachedFileSystemEntry &Entry)
-      : Minimized(Minimized), Filename(Name), Entry(Entry) {}
+  EntryRef(StringRef Name, const CachedFileSystemEntry &Entry)
+      : Filename(Name), Entry(Entry) {}
 
   llvm::vfs::Status getStatus() const {
     llvm::vfs::Status Stat = Entry.getStatus();
@@ -255,8 +261,11 @@
     return *this;
   }
 
-  StringRef getContents() const {
-    return Minimized ? Entry.getDirectiveTokens() : Entry.getOriginalContents();
+  StringRef getContents() const { return Entry.getOriginalContents(); }
+
+  Optional<ArrayRef<dependency_directives_scan::Directive>>
+  getDirectiveTokens() const {
+    return Entry.getDirectiveTokens();
   }
 };
 
Index: clang/include/clang/Lex/Lexer.h
===================================================================
--- clang/include/clang/Lex/Lexer.h
+++ clang/include/clang/Lex/Lexer.h
@@ -288,14 +288,8 @@
     return BufferPtr - BufferStart;
   }
 
-  /// Skip over \p NumBytes bytes.
-  ///
-  /// If the skip is successful, the next token will be lexed from the new
-  /// offset. The lexer also assumes that we skipped to the start of the line.
-  ///
-  /// \returns true if the skip failed (new offset would have been past the
-  /// end of the buffer), false otherwise.
-  bool skipOver(unsigned NumBytes);
+  /// Set the lexer's buffer pointer to \p Offset.
+  void seek(unsigned Offset, bool IsAtStartOfLine);
 
   /// Stringify - Convert the specified string into a C string by i) escaping
   /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
Index: clang/include/clang/Lex/DependencyDirectivesScanner.h
===================================================================
--- clang/include/clang/Lex/DependencyDirectivesScanner.h
+++ clang/include/clang/Lex/DependencyDirectivesScanner.h
@@ -19,15 +19,41 @@
 
 #include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
 
 namespace clang {
 
+namespace tok {
+enum TokenKind : unsigned short;
+}
+
 class DiagnosticsEngine;
 
 namespace dependency_directives_scan {
 
+/// Token lexed as part of dependency directive scanning.
+struct Token {
+  /// Offset into the original source input.
+  unsigned Offset;
+  unsigned Length;
+  tok::TokenKind Kind;
+  unsigned short Flags;
+
+  Token(unsigned Offset, unsigned Length, tok::TokenKind Kind,
+        unsigned short Flags)
+      : Offset(Offset), Length(Length), Kind(Kind), Flags(Flags) {}
+
+  unsigned getEnd() const { return Offset + Length; }
+
+  bool is(tok::TokenKind K) const { return Kind == K; }
+  bool isNot(tok::TokenKind K) const { return Kind != K; }
+  bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const {
+    return is(K1) || is(K2);
+  }
+  template <typename... Ts> bool isOneOf(tok::TokenKind K1, Ts... Ks) const {
+    return is(K1) || isOneOf(Ks...);
+  }
+};
+
 /// Represents the kind of preprocessor directive or a module declaration that
 /// is tracked by the scanner in its token output.
 enum DirectiveKind : uint8_t {
@@ -52,9 +78,10 @@
   pp_else,
   pp_endif,
   decl_at_import,
-  cxx_export_decl,
   cxx_module_decl,
   cxx_import_decl,
+  cxx_export_module_decl,
+  cxx_export_import_decl,
   pp_eof,
 };
 
@@ -62,35 +89,48 @@
 /// scanning. It's used to track various preprocessor directives that could
 /// potentially have an effect on the depedencies.
 struct Directive {
+  ArrayRef<Token> Tokens;
+
   /// The kind of token.
   DirectiveKind Kind = pp_none;
 
-  /// Offset into the output byte stream of where the directive begins.
-  int Offset = -1;
-
-  Directive(DirectiveKind K, int Offset) : Kind(K), Offset(Offset) {}
+  Directive() = default;
+  Directive(DirectiveKind K, ArrayRef<Token> Tokens)
+      : Tokens(Tokens), Kind(K) {}
 };
 
 } // end namespace dependency_directives_scan
 
-/// Minimize the input down to the preprocessor directives that might have
+/// Scan the input for the preprocessor directives that might have
 /// an effect on the dependencies for a compilation unit.
 ///
-/// This function deletes all non-preprocessor code, and strips anything that
-/// can't affect what gets included. It canonicalizes whitespace where
-/// convenient to stabilize the output against formatting changes in the input.
-///
-/// Clears the output vectors at the beginning of the call.
+/// This function ignores all non-preprocessor code and anything that
+/// can't affect what gets included.
 ///
 /// \returns false on success, true on error. If the diagnostic engine is not
 /// null, an appropriate error is reported using the given input location
-/// with the offset that corresponds to the minimizer's current buffer offset.
+/// with the offset that corresponds to the \p Input buffer offset.
 bool scanSourceForDependencyDirectives(
-    llvm::StringRef Input, llvm::SmallVectorImpl<char> &Output,
-    llvm::SmallVectorImpl<dependency_directives_scan::Directive> &Directives,
+    StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
+    SmallVectorImpl<dependency_directives_scan::Directive> &Directives,
     DiagnosticsEngine *Diags = nullptr,
     SourceLocation InputSourceLoc = SourceLocation());
 
+/// Print the previously scanned dependency directives as minimized source text.
+///
+/// \param Source The original source text that the dependency directives were
+/// scanned from.
+/// \param Directives The previously scanned dependency
+/// directives.
+/// \param OS the stream to print the dependency directives on.
+///
+/// This is used primarily for testing purposes, during dependency scanning the
+/// \p Lexer uses the tokens directly, not their printed version.
+void printDependencyDirectivesAsSource(
+    StringRef Source,
+    ArrayRef<dependency_directives_scan::Directive> Directives,
+    llvm::raw_ostream &OS);
+
 } // end namespace clang
 
 #endif // LLVM_CLANG_LEX_DEPENDENCYDIRECTIVESSCANNER_H