[PATCH] D125487: [Tooling/DependencyScanning] Refactor dependency scanning to produce pre-lexed preprocessor directive tokens, instead of minimized sources

Argyrios Kyrtzidis via Phabricator via cfe-commits Mon, 16 May 2022 10:27:57 -0700

akyrtzi updated this revision to Diff 429766.
akyrtzi added a comment.

Make sure to enable line comments for dependency directive lexing.



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D125487/new/

https://reviews.llvm.org/D125487

Files:
  clang/include/clang/Lex/DependencyDirectivesScanner.h
  clang/include/clang/Lex/Lexer.h
  clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
  clang/lib/Frontend/FrontendActions.cpp
  clang/lib/Lex/DependencyDirectivesScanner.cpp
  clang/lib/Lex/Lexer.cpp
  clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
  clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c
  clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c
  clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
  clang/unittests/Tooling/DependencyScannerTest.cpp

Index: clang/unittests/Tooling/DependencyScannerTest.cpp
===================================================================
--- clang/unittests/Tooling/DependencyScannerTest.cpp
+++ clang/unittests/Tooling/DependencyScannerTest.cpp
@@ -204,51 +204,5 @@
   EXPECT_EQ(convert_to_slash(Deps[5]), "/root/symlink.h");
 }
 
-namespace dependencies {
-TEST(DependencyScanningFilesystem, IgnoredFilesAreCachedSeparately1) {
-  auto VFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
-  VFS->addFile("/mod.h", 0,
-               llvm::MemoryBuffer::getMemBuffer("#include <foo.h>\n"
-                                                "// hi there!\n"));
-
-  DependencyScanningFilesystemSharedCache SharedCache;
-  DependencyScanningWorkerFilesystem DepFS(SharedCache, VFS);
-
-  DepFS.enableDirectivesScanningOfAllFiles(); // Let's be explicit for clarity.
-  auto StatusMinimized0 = DepFS.status("/mod.h");
-  DepFS.disableDirectivesScanning("/mod.h");
-  auto StatusFull1 = DepFS.status("/mod.h");
-
-  EXPECT_TRUE(StatusMinimized0);
-  EXPECT_TRUE(StatusFull1);
-  EXPECT_EQ(StatusMinimized0->getSize(), 17u);
-  EXPECT_EQ(StatusFull1->getSize(), 30u);
-  EXPECT_EQ(StatusMinimized0->getName(), StringRef("/mod.h"));
-  EXPECT_EQ(StatusFull1->getName(), StringRef("/mod.h"));
-}
-
-TEST(DependencyScanningFilesystem, IgnoredFilesAreCachedSeparately2) {
-  auto VFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
-  VFS->addFile("/mod.h", 0,
-               llvm::MemoryBuffer::getMemBuffer("#include <foo.h>\n"
-                                                "// hi there!\n"));
-
-  DependencyScanningFilesystemSharedCache SharedCache;
-  DependencyScanningWorkerFilesystem DepFS(SharedCache, VFS);
-
-  DepFS.disableDirectivesScanning("/mod.h");
-  auto StatusFull0 = DepFS.status("/mod.h");
-  DepFS.enableDirectivesScanningOfAllFiles();
-  auto StatusMinimized1 = DepFS.status("/mod.h");
-
-  EXPECT_TRUE(StatusFull0);
-  EXPECT_TRUE(StatusMinimized1);
-  EXPECT_EQ(StatusFull0->getSize(), 30u);
-  EXPECT_EQ(StatusMinimized1->getSize(), 17u);
-  EXPECT_EQ(StatusFull0->getName(), StringRef("/mod.h"));
-  EXPECT_EQ(StatusMinimized1->getName(), StringRef("/mod.h"));
-}
-
-} // end namespace dependencies
 } // end namespace tooling
 } // end namespace clang
Index: clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
===================================================================
--- clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
+++ clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
@@ -14,39 +14,58 @@
 using namespace clang;
 using namespace clang::dependency_directives_scan;
 
-static bool minimizeSourceToDependencyDirectives(StringRef Input,
-                                                 SmallVectorImpl<char> &Out) {
-  SmallVector<dependency_directives_scan::Directive, 32> Directives;
-  return scanSourceForDependencyDirectives(Input, Out, Directives);
+static bool minimizeSourceToDependencyDirectives(
+    StringRef Input, SmallVectorImpl<char> &Out,
+    SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
+    SmallVectorImpl<Directive> &Directives) {
+  Out.clear();
+  Tokens.clear();
+  Directives.clear();
+  if (scanSourceForDependencyDirectives(Input, Tokens, Directives))
+    return true;
+
+  raw_svector_ostream OS(Out);
+  printDependencyDirectivesAsSource(Input, Directives, OS);
+  if (!Out.empty() && Out.back() != '\n')
+    Out.push_back('\n');
+  Out.push_back('\0');
+  Out.pop_back();
+
+  return false;
 }
 
-static bool
-minimizeSourceToDependencyDirectives(StringRef Input,
-                                     SmallVectorImpl<char> &Out,
-                                     SmallVectorImpl<Directive> &Directives) {
-  return scanSourceForDependencyDirectives(Input, Out, Directives);
+static bool minimizeSourceToDependencyDirectives(StringRef Input,
+                                                 SmallVectorImpl<char> &Out) {
+  SmallVector<dependency_directives_scan::Token, 16> Tokens;
+  SmallVector<Directive, 32> Directives;
+  return minimizeSourceToDependencyDirectives(Input, Out, Tokens, Directives);
 }
 
 namespace {
 
 TEST(MinimizeSourceToDependencyDirectivesTest, Empty) {
   SmallVector<char, 128> Out;
+  SmallVector<dependency_directives_scan::Token, 4> Tokens;
   SmallVector<Directive, 4> Directives;
 
-  ASSERT_FALSE(minimizeSourceToDependencyDirectives("", Out, Directives));
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("", Out, Tokens, Directives));
   EXPECT_TRUE(Out.empty());
+  EXPECT_TRUE(Tokens.empty());
   ASSERT_EQ(1u, Directives.size());
   ASSERT_EQ(pp_eof, Directives.back().Kind);
 
-  ASSERT_FALSE(
-      minimizeSourceToDependencyDirectives("abc def\nxyz", Out, Directives));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("abc def\nxyz", Out, Tokens,
+                                                    Directives));
   EXPECT_TRUE(Out.empty());
+  EXPECT_TRUE(Tokens.empty());
   ASSERT_EQ(1u, Directives.size());
   ASSERT_EQ(pp_eof, Directives.back().Kind);
 }
 
-TEST(MinimizeSourceToDependencyDirectivesTest, AllDirectives) {
+TEST(MinimizeSourceToDependencyDirectivesTest, AllTokens) {
   SmallVector<char, 128> Out;
+  SmallVector<dependency_directives_scan::Token, 4> Tokens;
   SmallVector<Directive, 4> Directives;
 
   ASSERT_FALSE(
@@ -71,7 +90,7 @@
                                            "#pragma include_alias(<A>, <B>)\n"
                                            "export module m;\n"
                                            "import m;\n",
-                                           Out, Directives));
+                                           Out, Tokens, Directives));
   EXPECT_EQ(pp_define, Directives[0].Kind);
   EXPECT_EQ(pp_undef, Directives[1].Kind);
   EXPECT_EQ(pp_endif, Directives[2].Kind);
@@ -91,19 +110,28 @@
   EXPECT_EQ(pp_pragma_push_macro, Directives[16].Kind);
   EXPECT_EQ(pp_pragma_pop_macro, Directives[17].Kind);
   EXPECT_EQ(pp_pragma_include_alias, Directives[18].Kind);
-  EXPECT_EQ(cxx_export_decl, Directives[19].Kind);
-  EXPECT_EQ(cxx_module_decl, Directives[20].Kind);
-  EXPECT_EQ(cxx_import_decl, Directives[21].Kind);
-  EXPECT_EQ(pp_eof, Directives[22].Kind);
+  EXPECT_EQ(cxx_export_module_decl, Directives[19].Kind);
+  EXPECT_EQ(cxx_import_decl, Directives[20].Kind);
+  EXPECT_EQ(pp_eof, Directives[21].Kind);
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, EmptyHash) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("#\n#define MACRO a\n", Out));
+  EXPECT_STREQ("#define MACRO a\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, Define) {
   SmallVector<char, 128> Out;
+  SmallVector<dependency_directives_scan::Token, 4> Tokens;
   SmallVector<Directive, 4> Directives;
 
-  ASSERT_FALSE(
-      minimizeSourceToDependencyDirectives("#define MACRO", Out, Directives));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO", Out,
+                                                    Tokens, Directives));
   EXPECT_STREQ("#define MACRO\n", Out.data());
+  ASSERT_EQ(4u, Tokens.size());
   ASSERT_EQ(2u, Directives.size());
   ASSERT_EQ(pp_define, Directives.front().Kind);
 }
@@ -144,25 +172,25 @@
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO   con  tent   ", Out));
-  EXPECT_STREQ("#define MACRO con  tent\n", Out.data());
+  EXPECT_STREQ("#define MACRO con tent\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO()   con  tent   ", Out));
-  EXPECT_STREQ("#define MACRO() con  tent\n", Out.data());
+  EXPECT_STREQ("#define MACRO() con tent\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineInvalidMacroArguments) {
   SmallVector<char, 128> Out;
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO((a))", Out));
-  EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());
+  EXPECT_STREQ("#define MACRO((a))\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO(", Out));
-  EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());
+  EXPECT_STREQ("#define MACRO(\n", Out.data());
 
   ASSERT_FALSE(
       minimizeSourceToDependencyDirectives("#define MACRO(a * b)", Out));
-  EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());
+  EXPECT_STREQ("#define MACRO(a*b)\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineHorizontalWhitespace) {
@@ -170,19 +198,19 @@
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO(\t)\tcon \t tent\t", Out));
-  EXPECT_STREQ("#define MACRO() con \t tent\n", Out.data());
+  EXPECT_STREQ("#define MACRO() con tent\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO(\f)\fcon \f tent\f", Out));
-  EXPECT_STREQ("#define MACRO() con \f tent\n", Out.data());
+  EXPECT_STREQ("#define MACRO() con tent\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO(\v)\vcon \v tent\v", Out));
-  EXPECT_STREQ("#define MACRO() con \v tent\n", Out.data());
+  EXPECT_STREQ("#define MACRO() con tent\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(
       "#define MACRO \t\v\f\v\t con\f\t\vtent\v\f \v", Out));
-  EXPECT_STREQ("#define MACRO con\f\t\vtent\n", Out.data());
+  EXPECT_STREQ("#define MACRO con tent\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineMultilineArgs) {
@@ -255,25 +283,27 @@
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineNumber) {
   SmallVector<char, 128> Out;
 
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("#define 0\n", Out));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define 0\n", Out));
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoName) {
   SmallVector<char, 128> Out;
 
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("#define &\n", Out));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define &\n", Out));
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoWhitespace) {
   SmallVector<char, 128> Out;
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND&\n", Out));
-  EXPECT_STREQ("#define AND &\n", Out.data());
+  EXPECT_STREQ("#define AND&\n", Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND\\\n"
                                                     "&\n",
                                                     Out));
-  EXPECT_STREQ("#define AND &\n", Out.data());
+  EXPECT_STREQ("#define AND\\\n"
+               "&\n",
+               Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, MultilineComment) {
@@ -303,6 +333,14 @@
                Out.data());
 }
 
+TEST(MinimizeSourceToDependencyDirectivesTest, CommentSlashSlashStar) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      "#define MACRO 1 //* blah */\n", Out));
+  EXPECT_STREQ("#define MACRO 1\n", Out.data());
+}
+
 TEST(MinimizeSourceToDependencyDirectivesTest, Ifdef) {
   SmallVector<char, 128> Out;
 
@@ -481,6 +519,9 @@
   ASSERT_FALSE(
       minimizeSourceToDependencyDirectives("#__include_macros <A>\n", Out));
   EXPECT_STREQ("#__include_macros <A>\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#include MACRO\n", Out));
+  EXPECT_STREQ("#include MACRO\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, AtImport) {
@@ -507,8 +548,9 @@
   SmallVector<char, 128> Out;
 
   ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import A\n", Out));
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import MACRO(A);\n", Out));
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import \" \";\n", Out));
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("@import MACRO(A);\n", Out));
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("@import \" \";\n", Out));
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, RawStringLiteral) {
@@ -559,7 +601,8 @@
                                                     "#define GUARD\n"
                                                     "#endif\n",
                                                     Out));
-  EXPECT_STREQ("#ifndef GUARD\n"
+  EXPECT_STREQ("#if\\\n"
+               "ndef GUARD\n"
                "#define GUARD\n"
                "#endif\n",
                Out.data());
@@ -567,12 +610,16 @@
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\n"
                                                     "RD\n",
                                                     Out));
-  EXPECT_STREQ("#define GUARD\n", Out.data());
+  EXPECT_STREQ("#define GUA\\\n"
+               "RD\n",
+               Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\r"
                                                     "RD\n",
                                                     Out));
-  EXPECT_STREQ("#define GUARD\n", Out.data());
+  EXPECT_STREQ("#define GUA\\\r"
+               "RD\n",
+               Out.data());
 
   ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\n"
                                                     "           RD\n",
@@ -588,7 +635,10 @@
                                                     "2 + \\\t\n"
                                                     "3\n",
                                                     Out));
-  EXPECT_STREQ("#define A 1 + 2 + 3\n", Out.data());
+  EXPECT_STREQ("#define A 1+\\  \n"
+               "2+\\\t\n"
+               "3\n",
+               Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, PoundWarningAndError) {
@@ -682,6 +732,7 @@
 
 TEST(MinimizeSourceToDependencyDirectivesTest, PragmaOnce) {
   SmallVector<char, 128> Out;
+  SmallVector<dependency_directives_scan::Token, 4> Tokens;
   SmallVector<Directive, 4> Directives;
 
   StringRef Source = R"(// comment
@@ -689,7 +740,8 @@
 // another comment
 #include <test.h>
 )";
-  ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out, Directives));
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives));
   EXPECT_STREQ("#pragma once\n#include <test.h>\n", Out.data());
   ASSERT_EQ(Directives.size(), 3u);
   EXPECT_EQ(Directives[0].Kind, dependency_directives_scan::pp_pragma_once);
@@ -700,7 +752,7 @@
     #include <test.h>
     )";
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out));
-  EXPECT_STREQ("#pragma once\n#include <test.h>\n", Out.data());
+  EXPECT_STREQ("#pragma once extra tokens\n#include <test.h>\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest,
@@ -755,11 +807,12 @@
 
   Source = "#define X \"\\ \r\nx\n#include <x>\n";
   ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out));
-  EXPECT_STREQ("#define X \"\\ \r\nx\n#include <x>\n", Out.data());
+  EXPECT_STREQ("#define X\"\\ \r\nx\n#include <x>\n", Out.data());
 }
 
 TEST(MinimizeSourceToDependencyDirectivesTest, CxxModules) {
   SmallVector<char, 128> Out;
+  SmallVector<dependency_directives_scan::Token, 4> Tokens;
   SmallVector<Directive, 4> Directives;
 
   StringRef Source = R"(
@@ -789,16 +842,17 @@
       import f(->a = 3);
     }
     )";
-  ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out, Directives));
-  EXPECT_STREQ("#include \"textual-header.h\"\nexport module m;\n"
-               "export import :l [[rename]];\n"
-               "import <<= 3;\nimport a b d e d e f e;\n"
-               "import foo [[no_unique_address]];\nimport foo();\n"
-               "import f(:sefse);\nimport f(->a = 3);\n",
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives));
+  EXPECT_STREQ("#include \"textual-header.h\"\nexport module m;"
+               "exp\\\nort import:l[[rename]];"
+               "import<<=3;import a b d e d e f e;"
+               "import foo[[no_unique_address]];import foo();"
+               "import f(:sefse);import f(->a=3);\n",
                Out.data());
-  ASSERT_EQ(Directives.size(), 12u);
-  EXPECT_EQ(Directives[0].Kind, dependency_directives_scan::pp_include);
-  EXPECT_EQ(Directives[2].Kind, dependency_directives_scan::cxx_module_decl);
+  ASSERT_EQ(Directives.size(), 10u);
+  EXPECT_EQ(Directives[0].Kind, pp_include);
+  EXPECT_EQ(Directives[1].Kind, cxx_export_module_decl);
 }
 
 } // end anonymous namespace
Index: clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c
===================================================================
--- clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c
+++ clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c
@@ -15,7 +15,7 @@
 #pragma include_alias(<string>,   "mystring.h")
 
 // CHECK:      #pragma once
-// CHECK-NEXT: #pragma push_macro( "MYMACRO" )
+// CHECK-NEXT: #pragma push_macro("MYMACRO")
 // CHECK-NEXT: #pragma pop_macro("MYMACRO")
 // CHECK-NEXT: #pragma clang module import mymodule
 // CHECK-NEXT: #pragma include_alias(<string>, "mystring.h")
Index: clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c
===================================================================
--- clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c
+++ clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c
@@ -1,3 +1,4 @@
-// RUN: %clang_cc1 -verify -print-dependency-directives-minimized-source %s 2>&1
+// RUN: %clang_cc1 -print-dependency-directives-minimized-source %s 2>&1 | FileCheck %s
 
-#define 0 0 // expected-error {{macro name must be an identifier}}
+#define 0 0
+// CHECK: #define 0 0
Index: clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
===================================================================
--- clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
+++ clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h"
-#include "clang/Lex/DependencyDirectivesScanner.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SmallVectorMemoryBuffer.h"
 #include "llvm/Support/Threading.h"
@@ -45,47 +44,40 @@
     const CachedFileSystemEntry &Entry, StringRef Filename, bool Disable) {
   if (Entry.isError() || Entry.isDirectory() || Disable ||
       !shouldScanForDirectives(Filename, Entry.getUniqueID()))
-    return EntryRef(/*Minimized=*/false, Filename, Entry);
+    return EntryRef(Filename, Entry);
 
   CachedFileContents *Contents = Entry.getCachedContents();
   assert(Contents && "contents not initialized");
 
   // Double-checked locking.
-  if (Contents->MinimizedAccess.load())
-    return EntryRef(/*Minimized=*/true, Filename, Entry);
+  if (Contents->DepDirectives.load())
+    return EntryRef(Filename, Entry);
 
   std::lock_guard<std::mutex> GuardLock(Contents->ValueLock);
 
   // Double-checked locking.
-  if (Contents->MinimizedAccess.load())
-    return EntryRef(/*Minimized=*/true, Filename, Entry);
+  if (Contents->DepDirectives.load())
+    return EntryRef(Filename, Entry);
 
-  llvm::SmallString<1024> MinimizedFileContents;
-  // Minimize the file down to directives that might affect the dependencies.
-  SmallVector<dependency_directives_scan::Directive, 64> Tokens;
+  SmallVector<dependency_directives_scan::Directive, 64> Directives;
+  // Scan the file for preprocessor directives that might affect the
+  // dependencies.
   if (scanSourceForDependencyDirectives(Contents->Original->getBuffer(),
-                                        MinimizedFileContents, Tokens)) {
+                                        Contents->DepDirectiveTokens,
+                                        Directives)) {
+    Contents->DepDirectiveTokens.clear();
     // FIXME: Propagate the diagnostic if desired by the client.
-    // Use the original file if the minimization failed.
-    Contents->MinimizedStorage =
-        llvm::MemoryBuffer::getMemBuffer(*Contents->Original);
-    Contents->MinimizedAccess.store(Contents->MinimizedStorage.get());
-    return EntryRef(/*Minimized=*/true, Filename, Entry);
+    Contents->DepDirectives.store(new Optional<DependencyDirectivesTy>());
+    return EntryRef(Filename, Entry);
   }
 
-  // The contents produced by the minimizer must be null terminated.
-  assert(MinimizedFileContents.data()[MinimizedFileContents.size()] == '\0' &&
-         "not null terminated contents");
-
-  Contents->MinimizedStorage = std::make_unique<llvm::SmallVectorMemoryBuffer>(
-      std::move(MinimizedFileContents));
-  // This function performed double-checked locking using `MinimizedAccess`.
-  // Assigning it must be the last thing this function does. If we were to
-  // assign it before `PPSkippedRangeMapping`, other threads may skip the
-  // critical section (`MinimizedAccess != nullptr`) and access the mappings
-  // that are about to be initialized, leading to a data race.
-  Contents->MinimizedAccess.store(Contents->MinimizedStorage.get());
-  return EntryRef(/*Minimized=*/true, Filename, Entry);
+  // This function performed double-checked locking using `DepDirectives`.
+  // Assigning it must be the last thing this function does, otherwise other
+  // threads may skip the critical section (`DepDirectives != nullptr`),
+  // leading to a data race.
+  Contents->DepDirectives.store(
+      new Optional<DependencyDirectivesTy>(std::move(Directives)));
+  return EntryRef(Filename, Entry);
 }
 
 DependencyScanningFilesystemSharedCache::
Index: clang/lib/Lex/Lexer.cpp
===================================================================
--- clang/lib/Lex/Lexer.cpp
+++ clang/lib/Lex/Lexer.cpp
@@ -226,13 +226,11 @@
   return L;
 }
 
-bool Lexer::skipOver(unsigned NumBytes) {
-  IsAtPhysicalStartOfLine = true;
-  IsAtStartOfLine = true;
-  if ((BufferPtr + NumBytes) > BufferEnd)
-    return true;
-  BufferPtr += NumBytes;
-  return false;
+void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
+  this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
+  this->IsAtStartOfLine = IsAtStartOfLine;
+  assert((BufferStart + Offset) <= BufferEnd);
+  BufferPtr = BufferStart + Offset;
 }
 
 template <typename T> static void StringifyImpl(T &Str, char Quote) {
Index: clang/lib/Lex/DependencyDirectivesScanner.cpp
===================================================================
--- clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -18,83 +18,127 @@
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Lex/LexDiagnostic.h"
+#include "clang/Lex/Lexer.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/MemoryBuffer.h"
 
-using namespace llvm;
 using namespace clang;
 using namespace clang::dependency_directives_scan;
+using namespace llvm;
 
 namespace {
 
-struct Scanner {
-  /// Minimized output.
-  SmallVectorImpl<char> &Out;
-  /// The known tokens encountered during the minimization.
-  SmallVectorImpl<Directive> &Directives;
+struct DirectiveWithTokens {
+  DirectiveKind Kind;
+  unsigned NumTokens;
 
-  Scanner(SmallVectorImpl<char> &Out, SmallVectorImpl<Directive> &Directives,
-          StringRef Input, DiagnosticsEngine *Diags,
-          SourceLocation InputSourceLoc)
-      : Out(Out), Directives(Directives), Input(Input), Diags(Diags),
-        InputSourceLoc(InputSourceLoc) {}
+  DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens)
+      : Kind(Kind), NumTokens(NumTokens) {}
+};
+
+/// Does an efficient "scan" of the sources to detect the presence of
+/// preprocessor (or module import) directives and collects the raw lexed tokens
+/// for those directives so that the \p Lexer can "replay" them when the file is
+/// included.
+///
+/// Note that the behavior of the raw lexer is affected by the language mode,
+/// while at this point we want to do a scan and collect tokens once,
+/// irrespective of the language mode that the file will get included in. To
+/// compensate for that the \p Lexer, while "replaying", will adjust a token
+/// where appropriate, when it could affect the preprocessor's state.
+/// For example in a directive like
+///
+/// \code
+///   #if __has_cpp_attribute(clang::fallthrough)
+/// \endcode
+///
+/// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2
+/// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon'
+/// while in C++ mode.
+struct Scanner {
+  Scanner(StringRef Input,
+          SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
+          DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)
+      : Input(Input), Tokens(Tokens), Diags(Diags),
+        InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()),
+        TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
+                 Input.end()) {}
+
+  static LangOptions getLangOptsForDepScanning() {
+    LangOptions LangOpts;
+    // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
+    LangOpts.ObjC = true;
+    LangOpts.LineComment = true;
+    return LangOpts;
+  }
 
   /// Lex the provided source and emit the directive tokens.
   ///
   /// \returns True on error.
-  bool scan();
+  bool scan(SmallVectorImpl<Directive> &Directives);
 
 private:
-  struct IdInfo {
-    const char *Last;
-    StringRef Name;
-  };
+  /// Lexes next token and advances \p First and the \p Lexer.
+  LLVM_NODISCARD dependency_directives_scan::Token &
+  lexToken(const char *&First, const char *const End);
 
-  /// Lex an identifier.
+  dependency_directives_scan::Token &lexIncludeFilename(const char *&First,
+                                                        const char *const End);
+
+  /// Lexes the next token and, if it is an identifier, returns its string;
+  /// otherwise it skips the current line and returns \p None.
   ///
-  /// \pre First points at a valid identifier head.
-  LLVM_NODISCARD IdInfo lexIdentifier(const char *First, const char *const End);
-  LLVM_NODISCARD bool isNextIdentifier(StringRef Id, const char *&First,
-                                       const char *const End);
+  /// In any case (whatever the token kind) \p First and the \p Lexer will
+  /// advance beyond the token.
+  LLVM_NODISCARD Optional<StringRef>
+  tryLexIdentifierOrSkipLine(const char *&First, const char *const End);
+
+  /// Used when it is certain that the next token is an identifier.
+  LLVM_NODISCARD StringRef lexIdentifier(const char *&First,
+                                         const char *const End);
+
+  /// Lexes next token and returns true iff it is an identifier that matches \p
+  /// Id, otherwise it skips the current line and returns false.
+  ///
+  /// In any case (whatever the token kind) \p First and the \p Lexer will
+  /// advance beyond the token.
+  LLVM_NODISCARD bool isNextIdentifierOrSkipLine(StringRef Id,
+                                                 const char *&First,
+                                                 const char *const End);
+
   LLVM_NODISCARD bool scanImpl(const char *First, const char *const End);
   LLVM_NODISCARD bool lexPPLine(const char *&First, const char *const End);
   LLVM_NODISCARD bool lexAt(const char *&First, const char *const End);
   LLVM_NODISCARD bool lexModule(const char *&First, const char *const End);
-  LLVM_NODISCARD bool lexDefine(const char *&First, const char *const End);
+  LLVM_NODISCARD bool lexDefine(const char *HashLoc, const char *&First,
+                                const char *const End);
   LLVM_NODISCARD bool lexPragma(const char *&First, const char *const End);
   LLVM_NODISCARD bool lexEndif(const char *&First, const char *const End);
-  LLVM_NODISCARD bool lexDefault(DirectiveKind Kind, StringRef Directive,
-                                 const char *&First, const char *const End);
-  Directive &pushDirective(DirectiveKind K) {
-    Directives.emplace_back(K, Out.size());
-    return Directives.back();
+  LLVM_NODISCARD bool lexDefault(DirectiveKind Kind, const char *&First,
+                                 const char *const End);
+  LLVM_NODISCARD bool lexModuleDirectiveBody(DirectiveKind Kind,
+                                             const char *&First,
+                                             const char *const End);
+  void lexPPDirectiveBody(const char *&First, const char *const End);
+
+  DirectiveWithTokens &pushDirective(DirectiveKind Kind) {
+    Tokens.append(CurDirToks);
+    DirsWithToks.emplace_back(Kind, CurDirToks.size());
+    CurDirToks.clear();
+    return DirsWithToks.back();
   }
   void popDirective() {
-    Out.resize(Directives.back().Offset);
-    Directives.pop_back();
+    Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);
   }
   DirectiveKind topDirective() const {
-    return Directives.empty() ? pp_none : Directives.back().Kind;
+    return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind;
   }
 
-  Scanner &put(char Byte) {
-    Out.push_back(Byte);
-    return *this;
+  unsigned getOffsetAt(const char *CurPtr) const {
+    return CurPtr - Input.data();
   }
-  Scanner &append(StringRef S) { return append(S.begin(), S.end()); }
-  Scanner &append(const char *First, const char *Last) {
-    Out.append(First, Last);
-    return *this;
-  }
-
-  void printToNewline(const char *&First, const char *const End);
-  void printAdjacentModuleNameParts(const char *&First, const char *const End);
-  LLVM_NODISCARD bool printAtImportBody(const char *&First,
-                                        const char *const End);
-  void printDirectiveBody(const char *&First, const char *const End);
-  void printAdjacentMacroArgs(const char *&First, const char *const End);
-  LLVM_NODISCARD bool printMacroArgs(const char *&First, const char *const End);
 
   /// Reports a diagnostic if the diagnostic engine is provided. Always returns
   /// true at the end.
@@ -102,8 +146,14 @@
 
   StringMap<char> SplitIds;
   StringRef Input;
+  SmallVectorImpl<dependency_directives_scan::Token> &Tokens;
   DiagnosticsEngine *Diags;
   SourceLocation InputSourceLoc;
+
+  SmallVector<dependency_directives_scan::Token, 32> CurDirToks;
+  SmallVector<DirectiveWithTokens, 64> DirsWithToks;
+  LangOptions LangOpts;
+  Lexer TheLexer;
 };
 
 } // end anonymous namespace
@@ -112,7 +162,7 @@
   if (!Diags)
     return true;
   assert(CurPtr >= Input.data() && "invalid buffer ptr");
-  Diags->Report(InputSourceLoc.getLocWithOffset(CurPtr - Input.data()), Err);
+  Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err);
   return true;
 }
 
@@ -265,30 +315,6 @@
   }
 }
 
-static const char *findLastNonSpace(const char *First, const char *Last) {
-  assert(First <= Last);
-  while (First != Last && isHorizontalWhitespace(Last[-1]))
-    --Last;
-  return Last;
-}
-
-static const char *findLastNonSpaceNonBackslash(const char *First,
-                                                const char *Last) {
-  assert(First <= Last);
-  while (First != Last &&
-         (isHorizontalWhitespace(Last[-1]) || Last[-1] == '\\'))
-    --Last;
-  return Last;
-}
-
-static const char *findFirstTrailingSpace(const char *First, const char *Last) {
-  const char *LastNonSpace = findLastNonSpace(First, Last);
-  if (Last == LastNonSpace)
-    return Last;
-  assert(isHorizontalWhitespace(LastNonSpace[0]));
-  return LastNonSpace + 1;
-}
-
 static void skipLineComment(const char *&First, const char *const End) {
   assert(First[0] == '/' && First[1] == '/');
   First += 2;
@@ -396,67 +422,6 @@
     skipLine(First, End);
 }
 
-void Scanner::printToNewline(const char *&First, const char *const End) {
-  while (First != End && !isVerticalWhitespace(*First)) {
-    const char *Last = First;
-    do {
-      // Iterate over strings correctly to avoid comments and newlines.
-      if (*Last == '"' || *Last == '\'' ||
-          (*Last == '<' &&
-           (topDirective() == pp_include || topDirective() == pp_import))) {
-        if (LLVM_UNLIKELY(isRawStringLiteral(First, Last)))
-          skipRawString(Last, End);
-        else
-          skipString(Last, End);
-        continue;
-      }
-      if (*Last != '/' || End - Last < 2) {
-        ++Last;
-        continue; // Gather the rest up to print verbatim.
-      }
-
-      if (Last[1] != '/' && Last[1] != '*') {
-        ++Last;
-        continue;
-      }
-
-      // Deal with "//..." and "/*...*/".
-      append(First, findFirstTrailingSpace(First, Last));
-      First = Last;
-
-      if (Last[1] == '/') {
-        skipLineComment(First, End);
-        return;
-      }
-
-      put(' ');
-      skipBlockComment(First, End);
-      skipOverSpaces(First, End);
-      Last = First;
-    } while (Last != End && !isVerticalWhitespace(*Last));
-
-    // Print out the string.
-    const char *LastBeforeTrailingSpace = findLastNonSpace(First, Last);
-    if (Last == End || LastBeforeTrailingSpace == First ||
-        LastBeforeTrailingSpace[-1] != '\\') {
-      append(First, LastBeforeTrailingSpace);
-      First = Last;
-      skipNewline(First, End);
-      return;
-    }
-
-    // Print up to the last character that's not a whitespace or backslash.
-    // Then print exactly one space, which matters when tokens are separated by
-    // a line continuation.
-    append(First, findLastNonSpaceNonBackslash(First, Last));
-    put(' ');
-
-    First = Last;
-    skipNewline(First, End);
-    skipOverSpaces(First, End);
-  }
-}
-
 static void skipWhitespace(const char *&First, const char *const End) {
   for (;;) {
     assert(First <= End);
@@ -489,176 +454,134 @@
   }
 }
 
-void Scanner::printAdjacentModuleNameParts(const char *&First,
-                                           const char *const End) {
-  // Skip over parts of the body.
-  const char *Last = First;
-  do
-    ++Last;
-  while (Last != End && (isAsciiIdentifierContinue(*Last) || *Last == '.'));
-  append(First, Last);
-  First = Last;
-}
-
-bool Scanner::printAtImportBody(const char *&First, const char *const End) {
+bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,
+                                     const char *const End) {
+  const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;
   for (;;) {
-    skipWhitespace(First, End);
-    if (First == End)
-      return true;
-
-    if (isVerticalWhitespace(*First)) {
-      skipNewline(First, End);
-      continue;
-    }
-
-    // Found a semicolon.
-    if (*First == ';') {
-      put(*First++).put('\n');
-      return false;
-    }
-
-    // Don't handle macro expansions inside @import for now.
-    if (!isAsciiIdentifierContinue(*First) && *First != '.')
-      return true;
-
-    printAdjacentModuleNameParts(First, End);
+    const dependency_directives_scan::Token &Tok = lexToken(First, End);
+    if (Tok.is(tok::eof))
+      return reportError(
+          DirectiveLoc,
+          diag::err_dep_source_scanner_missing_semi_after_at_import);
+    if (Tok.is(tok::semi))
+      break;
   }
+  pushDirective(Kind);
+  skipWhitespace(First, End);
+  if (First == End)
+    return false;
+  if (!isVerticalWhitespace(*First))
+    return reportError(
+        DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import);
+  skipNewline(First, End);
+  return false;
 }
 
-void Scanner::printDirectiveBody(const char *&First, const char *const End) {
-  skipWhitespace(First, End); // Skip initial whitespace.
-  printToNewline(First, End);
-  while (Out.back() == ' ')
-    Out.pop_back();
-  put('\n');
-}
+dependency_directives_scan::Token &Scanner::lexToken(const char *&First,
+                                                     const char *const End) {
+  clang::Token Tok;
+  TheLexer.LexFromRawLexer(Tok);
+  First = Input.data() + TheLexer.getCurrentBufferOffset();
+  assert(First <= End);
 
-LLVM_NODISCARD static const char *lexRawIdentifier(const char *First,
-                                                   const char *const End) {
-  assert(isAsciiIdentifierContinue(*First) && "invalid identifer");
-  const char *Last = First + 1;
-  while (Last != End && isAsciiIdentifierContinue(*Last))
-    ++Last;
-  return Last;
+  unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
+  CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
+                          Tok.getFlags());
+  return CurDirToks.back();
 }
 
-LLVM_NODISCARD static const char *
-getIdentifierContinuation(const char *First, const char *const End) {
-  if (End - First < 3 || First[0] != '\\' || !isVerticalWhitespace(First[1]))
-    return nullptr;
+dependency_directives_scan::Token &
+Scanner::lexIncludeFilename(const char *&First, const char *const End) {
+  clang::Token Tok;
+  TheLexer.LexIncludeFilename(Tok);
+  First = Input.data() + TheLexer.getCurrentBufferOffset();
+  assert(First <= End);
 
-  ++First;
-  skipNewline(First, End);
-  if (First == End)
-    return nullptr;
-  return isAsciiIdentifierContinue(First[0]) ? First : nullptr;
-}
-
-Scanner::IdInfo Scanner::lexIdentifier(const char *First,
-                                       const char *const End) {
-  const char *Last = lexRawIdentifier(First, End);
-  const char *Next = getIdentifierContinuation(Last, End);
-  if (LLVM_LIKELY(!Next))
-    return IdInfo{Last, StringRef(First, Last - First)};
-
-  // Slow path, where identifiers are split over lines.
-  SmallVector<char, 64> Id(First, Last);
-  while (Next) {
-    Last = lexRawIdentifier(Next, End);
-    Id.append(Next, Last);
-    Next = getIdentifierContinuation(Last, End);
-  }
-  return IdInfo{
-      Last,
-      SplitIds.try_emplace(StringRef(Id.begin(), Id.size()), 0).first->first()};
+  unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
+  CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
+                          Tok.getFlags());
+  return CurDirToks.back();
 }
 
-void Scanner::printAdjacentMacroArgs(const char *&First,
-                                     const char *const End) {
-  // Skip over parts of the body.
-  const char *Last = First;
-  do
-    ++Last;
-  while (Last != End &&
-         (isAsciiIdentifierContinue(*Last) || *Last == '.' || *Last == ','));
-  append(First, Last);
-  First = Last;
+void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
+  while (true) {
+    const dependency_directives_scan::Token &Tok = lexToken(First, End);
+    if (Tok.is(tok::eod))
+      break;
+  }
 }
 
-bool Scanner::printMacroArgs(const char *&First, const char *const End) {
-  assert(*First == '(');
-  put(*First++);
-  for (;;) {
-    skipWhitespace(First, End);
-    if (First == End)
-      return true;
+LLVM_NODISCARD Optional<StringRef>
+Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
+  const dependency_directives_scan::Token &Tok = lexToken(First, End);
+  if (Tok.isNot(tok::raw_identifier)) {
+    if (!Tok.is(tok::eod))
+      skipLine(First, End);
+    return None;
+  }
 
-    if (*First == ')') {
-      put(*First++);
-      return false;
-    }
+  bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
+  if (LLVM_LIKELY(!NeedsCleaning))
+    return Input.slice(Tok.Offset, Tok.getEnd());
 
-    // This is intentionally fairly liberal.
-    if (!(isAsciiIdentifierContinue(*First) || *First == '.' || *First == ','))
-      return true;
+  SmallString<64> Spelling;
+  Spelling.resize(Tok.Length);
 
-    printAdjacentMacroArgs(First, End);
+  unsigned SpellingLength = 0;
+  const char *BufPtr = Input.begin() + Tok.Offset;
+  const char *AfterIdent = Input.begin() + Tok.getEnd();
+  while (BufPtr < AfterIdent) {
+    unsigned Size;
+    Spelling[SpellingLength++] =
+        Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
+    BufPtr += Size;
   }
+
+  return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
+      .first->first();
 }
 
-/// Looks for an identifier starting from Last.
-///
-/// Updates "First" to just past the next identifier, if any.  Returns true iff
-/// the identifier matches "Id".
-bool Scanner::isNextIdentifier(StringRef Id, const char *&First,
-                               const char *const End) {
-  skipWhitespace(First, End);
-  if (First == End || !isAsciiIdentifierStart(*First))
-    return false;
+StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
+  Optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
+  assert(Id.hasValue() && "expected identifier token");
+  return Id.getValue();
+}
 
-  IdInfo FoundId = lexIdentifier(First, End);
-  First = FoundId.Last;
-  return FoundId.Name == Id;
+bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
+                                         const char *const End) {
+  if (Optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End)) {
+    if (*FoundId == Id)
+      return true;
+    skipLine(First, End);
+  }
+  return false;
 }
 
 bool Scanner::lexAt(const char *&First, const char *const End) {
   // Handle "@import".
-  const char *ImportLoc = First++;
-  if (!isNextIdentifier("import", First, End)) {
-    skipLine(First, End);
-    return false;
-  }
-  pushDirective(decl_at_import);
-  append("@import ");
-  if (printAtImportBody(First, End))
-    return reportError(
-        ImportLoc, diag::err_dep_source_scanner_missing_semi_after_at_import);
-  skipWhitespace(First, End);
-  if (First == End)
+
+  // Lex '@'.
+  const dependency_directives_scan::Token &AtTok = lexToken(First, End);
+  assert(AtTok.is(tok::at));
+  (void)AtTok;
+
+  if (!isNextIdentifierOrSkipLine("import", First, End))
     return false;
-  if (!isVerticalWhitespace(*First))
-    return reportError(
-        ImportLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import);
-  skipNewline(First, End);
-  return false;
+  return lexModuleDirectiveBody(decl_at_import, First, End);
 }
 
 bool Scanner::lexModule(const char *&First, const char *const End) {
-  IdInfo Id = lexIdentifier(First, End);
-  First = Id.Last;
+  StringRef Id = lexIdentifier(First, End);
   bool Export = false;
-  if (Id.Name == "export") {
+  if (Id == "export") {
     Export = true;
-    skipWhitespace(First, End);
-    if (!isAsciiIdentifierContinue(*First)) {
-      skipLine(First, End);
+    Optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End);
+    if (!NextId)
       return false;
-    }
-    Id = lexIdentifier(First, End);
-    First = Id.Last;
+    Id = *NextId;
   }
 
-  if (Id.Name != "module" && Id.Name != "import") {
+  if (Id != "module" && Id != "import") {
     skipLine(First, End);
     return false;
   }
@@ -680,114 +603,51 @@
     }
   }
 
-  if (Export) {
-    pushDirective(cxx_export_decl);
-    append("export ");
-  }
+  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false);
 
-  if (Id.Name == "module")
-    pushDirective(cxx_module_decl);
+  DirectiveKind Kind;
+  if (Id == "module")
+    Kind = Export ? cxx_export_module_decl : cxx_module_decl;
   else
-    pushDirective(cxx_import_decl);
-  append(Id.Name);
-  append(" ");
-  printToNewline(First, End);
-  append("\n");
-  return false;
-}
-
-bool Scanner::lexDefine(const char *&First, const char *const End) {
-  pushDirective(pp_define);
-  append("#define ");
-  skipWhitespace(First, End);
-
-  if (!isAsciiIdentifierStart(*First))
-    return reportError(First, diag::err_pp_macro_not_identifier);
+    Kind = Export ? cxx_export_import_decl : cxx_import_decl;
 
-  IdInfo Id = lexIdentifier(First, End);
-  const char *Last = Id.Last;
-  append(Id.Name);
-  if (Last == End)
-    return false;
-  if (*Last == '(') {
-    size_t Size = Out.size();
-    if (printMacroArgs(Last, End)) {
-      // Be robust to bad macro arguments, since they can show up in disabled
-      // code.
-      Out.resize(Size);
-      append("(/* invalid */\n");
-      skipLine(Last, End);
-      return false;
-    }
-  }
-  skipWhitespace(Last, End);
-  if (Last == End)
-    return false;
-  if (!isVerticalWhitespace(*Last))
-    put(' ');
-  printDirectiveBody(Last, End);
-  First = Last;
-  return false;
+  return lexModuleDirectiveBody(Kind, First, End);
 }
 
 bool Scanner::lexPragma(const char *&First, const char *const End) {
-  // #pragma.
-  skipWhitespace(First, End);
-  if (First == End || !isAsciiIdentifierStart(*First))
+  Optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
+  if (!FoundId)
     return false;
 
-  IdInfo FoundId = lexIdentifier(First, End);
-  First = FoundId.Last;
-  if (FoundId.Name == "once") {
-    // #pragma once
-    skipLine(First, End);
-    pushDirective(pp_pragma_once);
-    append("#pragma once\n");
-    return false;
-  }
-  if (FoundId.Name == "push_macro") {
-    // #pragma push_macro
-    pushDirective(pp_pragma_push_macro);
-    append("#pragma push_macro");
-    printDirectiveBody(First, End);
-    return false;
-  }
-  if (FoundId.Name == "pop_macro") {
-    // #pragma pop_macro
-    pushDirective(pp_pragma_pop_macro);
-    append("#pragma pop_macro");
-    printDirectiveBody(First, End);
-    return false;
-  }
-  if (FoundId.Name == "include_alias") {
-    // #pragma include_alias
-    pushDirective(pp_pragma_include_alias);
-    append("#pragma include_alias");
-    printDirectiveBody(First, End);
+  StringRef Id = FoundId.getValue();
+  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
+                  .Case("once", pp_pragma_once)
+                  .Case("push_macro", pp_pragma_push_macro)
+                  .Case("pop_macro", pp_pragma_pop_macro)
+                  .Case("include_alias", pp_pragma_include_alias)
+                  .Default(pp_none);
+  if (Kind != pp_none) {
+    lexPPDirectiveBody(First, End);
+    pushDirective(Kind);
     return false;
   }
 
-  if (FoundId.Name != "clang") {
+  if (Id != "clang") {
     skipLine(First, End);
     return false;
   }
 
   // #pragma clang.
-  if (!isNextIdentifier("module", First, End)) {
-    skipLine(First, End);
+  if (!isNextIdentifierOrSkipLine("module", First, End))
     return false;
-  }
 
   // #pragma clang module.
-  if (!isNextIdentifier("import", First, End)) {
-    skipLine(First, End);
+  if (!isNextIdentifierOrSkipLine("import", First, End))
     return false;
-  }
 
   // #pragma clang module import.
+  lexPPDirectiveBody(First, End);
   pushDirective(pp_pragma_import);
-  append("#pragma clang module import ");
-  printDirectiveBody(First, End);
   return false;
 }
 
@@ -808,14 +668,13 @@
     return false;
   }
 
-  return lexDefault(pp_endif, "endif", First, End);
+  return lexDefault(pp_endif, First, End);
 }
 
-bool Scanner::lexDefault(DirectiveKind Kind, StringRef Directive,
-                         const char *&First, const char *const End) {
+bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
+                         const char *const End) {
+  lexPPDirectiveBody(First, End);
   pushDirective(Kind);
-  put('#').append(Directive).put(' ');
-  printDirectiveBody(First, End);
   return false;
 }
 
@@ -845,6 +704,14 @@
     return false;
   }
 
+  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);
+
+  auto ScEx1 = make_scope_exit([&]() {
+    // Clear Scanner's CurDirToks before returning, in case we didn't push a
+    // new directive.
+    CurDirToks.clear();
+  });
+
   // Handle "@import".
   if (*First == '@')
     return lexAt(First, End);
@@ -853,25 +720,26 @@
     return lexModule(First, End);
 
   // Handle preprocessing directives.
-  ++First; // Skip over '#'.
-  skipWhitespace(First, End);
 
-  if (First == End)
-    return reportError(First, diag::err_pp_expected_eol);
+  TheLexer.setParsingPreprocessorDirective(true);
+  auto ScEx2 = make_scope_exit(
+      [&]() { TheLexer.setParsingPreprocessorDirective(false); });
 
-  if (!isAsciiIdentifierStart(*First)) {
-    skipLine(First, End);
+  // Lex '#'.
+  const dependency_directives_scan::Token &HashTok = lexToken(First, End);
+  assert(HashTok.is(tok::hash));
+  (void)HashTok;
+
+  Optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
+  if (!FoundId)
     return false;
-  }
 
-  // Figure out the token.
-  IdInfo Id = lexIdentifier(First, End);
-  First = Id.Last;
+  StringRef Id = FoundId.getValue();
 
-  if (Id.Name == "pragma")
+  if (Id == "pragma")
     return lexPragma(First, End);
 
-  auto Kind = llvm::StringSwitch<DirectiveKind>(Id.Name)
+  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
                   .Case("include", pp_include)
                   .Case("__include_macros", pp___include_macros)
                   .Case("define", pp_define)
@@ -888,18 +756,26 @@
                   .Case("endif", pp_endif)
                   .Default(pp_none);
   if (Kind == pp_none) {
-    skipDirective(Id.Name, First, End);
+    skipDirective(Id, First, End);
     return false;
   }
 
   if (Kind == pp_endif)
     return lexEndif(First, End);
 
-  if (Kind == pp_define)
-    return lexDefine(First, End);
+  switch (Kind) {
+  case pp_include:
+  case pp___include_macros:
+  case pp_include_next:
+  case pp_import:
+    lexIncludeFilename(First, End);
+    break;
+  default:
+    break;
+  }
 
   // Everything else.
-  return lexDefault(Kind, Id.Name, First, End);
+  return lexDefault(Kind, First, End);
 }
 
 static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
@@ -916,28 +792,65 @@
   return false;
 }
 
-bool Scanner::scan() {
+bool Scanner::scan(SmallVectorImpl<Directive> &Directives) {
   bool Error = scanImpl(Input.begin(), Input.end());
 
   if (!Error) {
-    // Add a trailing newline and an EOF on success.
-    if (!Out.empty() && Out.back() != '\n')
-      Out.push_back('\n');
+    // Add an EOF on success.
     pushDirective(pp_eof);
   }
 
-  // Null-terminate the output. This way the memory buffer that's passed to
-  // Clang will not have to worry about the terminating '\0'.
-  Out.push_back(0);
-  Out.pop_back();
+  ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens;
+  for (const DirectiveWithTokens &DirWithToks : DirsWithToks) {
+    assert(RemainingTokens.size() >= DirWithToks.NumTokens);
+    Directives.emplace_back(DirWithToks.Kind,
+                            RemainingTokens.take_front(DirWithToks.NumTokens));
+    RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens);
+  }
+  assert(RemainingTokens.empty());
+
   return Error;
 }
 
 bool clang::scanSourceForDependencyDirectives(
-    StringRef Input, SmallVectorImpl<char> &Output,
+    StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
     SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags,
     SourceLocation InputSourceLoc) {
-  Output.clear();
-  Directives.clear();
-  return Scanner(Output, Directives, Input, Diags, InputSourceLoc).scan();
+  return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives);
+}
+
+void clang::printDependencyDirectivesAsSource(
+    StringRef Source,
+    ArrayRef<dependency_directives_scan::Directive> Directives,
+    llvm::raw_ostream &OS) {
+  // Add a space separator where it is convenient for testing purposes.
+  auto needsSpaceSeparator =
+      [](tok::TokenKind Prev,
+         const dependency_directives_scan::Token &Tok) -> bool {
+    if (Prev == Tok.Kind)
+      return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square,
+                          tok::r_square);
+    if (Prev == tok::raw_identifier &&
+        Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal,
+                    tok::char_constant, tok::header_name))
+      return true;
+    if (Prev == tok::r_paren &&
+        Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal,
+                    tok::char_constant, tok::unknown))
+      return true;
+    if (Prev == tok::comma &&
+        Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less))
+      return true;
+    return false;
+  };
+
+  for (const dependency_directives_scan::Directive &Directive : Directives) {
+    Optional<tok::TokenKind> PrevTokenKind;
+    for (const dependency_directives_scan::Token &Tok : Directive.Tokens) {
+      if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok))
+        OS << ' ';
+      PrevTokenKind = Tok.Kind;
+      OS << Source.slice(Tok.Offset, Tok.getEnd());
+    }
+  }
 }
Index: clang/lib/Frontend/FrontendActions.cpp
===================================================================
--- clang/lib/Frontend/FrontendActions.cpp
+++ clang/lib/Frontend/FrontendActions.cpp
@@ -1157,10 +1157,10 @@
   SourceManager &SM = CI.getPreprocessor().getSourceManager();
   llvm::MemoryBufferRef FromFile = SM.getBufferOrFake(SM.getMainFileID());
 
-  llvm::SmallString<1024> Output;
+  llvm::SmallVector<dependency_directives_scan::Token, 16> Tokens;
   llvm::SmallVector<dependency_directives_scan::Directive, 32> Directives;
   if (scanSourceForDependencyDirectives(
-          FromFile.getBuffer(), Output, Directives, &CI.getDiagnostics(),
+          FromFile.getBuffer(), Tokens, Directives, &CI.getDiagnostics(),
           SM.getLocForStartOfFile(SM.getMainFileID()))) {
     assert(CI.getDiagnostics().hasErrorOccurred() &&
            "no errors reported for failure");
@@ -1179,7 +1179,8 @@
     }
     return;
   }
-  llvm::outs() << Output;
+  printDependencyDirectivesAsSource(FromFile.getBuffer(), Directives,
+                                    llvm::outs());
 }
 
 void GetDependenciesByModuleNameAction::ExecuteAction() {
Index: clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
===================================================================
--- clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
+++ clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
@@ -10,6 +10,7 @@
 #define LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGFILESYSTEM_H
 
 #include "clang/Basic/LLVM.h"
+#include "clang/Lex/DependencyDirectivesScanner.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/Allocator.h"
@@ -21,21 +22,26 @@
 namespace tooling {
 namespace dependencies {
 
-/// Original and minimized contents of a cached file entry. Single instance can
+using DependencyDirectivesTy =
+    SmallVector<dependency_directives_scan::Directive, 20>;
+
+/// Contents and directive tokens of a cached file entry. Single instance can
 /// be shared between multiple entries.
 struct CachedFileContents {
-  CachedFileContents(std::unique_ptr<llvm::MemoryBuffer> Original)
-      : Original(std::move(Original)), MinimizedAccess(nullptr) {}
+  CachedFileContents(std::unique_ptr<llvm::MemoryBuffer> Contents)
+      : Original(std::move(Contents)), DepDirectives(nullptr) {}
 
   /// Owning storage for the original contents.
   std::unique_ptr<llvm::MemoryBuffer> Original;
 
   /// The mutex that must be locked before mutating directive tokens.
   std::mutex ValueLock;
-  /// Owning storage for the minimized contents.
-  std::unique_ptr<llvm::MemoryBuffer> MinimizedStorage;
+  SmallVector<dependency_directives_scan::Token, 10> DepDirectiveTokens;
   /// Accessor to the directive tokens that's atomic to avoid data races.
-  std::atomic<llvm::MemoryBuffer *> MinimizedAccess;
+  /// \c CachedFileContents owns the pointee and deletes it on destruction.
+  std::atomic<const Optional<DependencyDirectivesTy> *> DepDirectives;
+
+  ~CachedFileContents() { delete DepDirectives.load(); }
 };
 
 /// An in-memory representation of a file system entity that is of interest to
@@ -82,13 +88,17 @@
 
   /// \returns The scanned preprocessor directive tokens of the file that are
   /// used to speed up preprocessing, if available.
-  StringRef getDirectiveTokens() const {
+  Optional<ArrayRef<dependency_directives_scan::Directive>>
+  getDirectiveTokens() const {
     assert(!isError() && "error");
-    assert(!MaybeStat->isDirectory() && "not a file");
+    assert(!isDirectory() && "not a file");
     assert(Contents && "contents not initialized");
-    llvm::MemoryBuffer *Buffer = Contents->MinimizedAccess.load();
-    assert(Buffer && "not minimized");
-    return Buffer->getBuffer();
+    if (auto *Directives = Contents->DepDirectives.load()) {
+      if (Directives->hasValue())
+        return ArrayRef<dependency_directives_scan::Directive>(
+            Directives->getValue());
+    }
+    return None;
   }
 
   /// \returns The error.
@@ -224,10 +234,6 @@
 /// If the underlying entry is an opened file, this wrapper returns the file
 /// contents and the scanned preprocessor directives.
 class EntryRef {
-  /// For entry that is an opened file, this bit signifies whether its contents
-  /// are minimized.
-  bool Minimized;
-
   /// The filename used to access this entry.
   std::string Filename;
 
@@ -235,8 +241,8 @@
   const CachedFileSystemEntry &Entry;
 
 public:
-  EntryRef(bool Minimized, StringRef Name, const CachedFileSystemEntry &Entry)
-      : Minimized(Minimized), Filename(Name), Entry(Entry) {}
+  EntryRef(StringRef Name, const CachedFileSystemEntry &Entry)
+      : Filename(Name), Entry(Entry) {}
 
   llvm::vfs::Status getStatus() const {
     llvm::vfs::Status Stat = Entry.getStatus();
@@ -255,8 +261,11 @@
     return *this;
   }
 
-  StringRef getContents() const {
-    return Minimized ? Entry.getDirectiveTokens() : Entry.getOriginalContents();
+  StringRef getContents() const { return Entry.getOriginalContents(); }
+
+  Optional<ArrayRef<dependency_directives_scan::Directive>>
+  getDirectiveTokens() const {
+    return Entry.getDirectiveTokens();
   }
 };
 
Index: clang/include/clang/Lex/Lexer.h
===================================================================
--- clang/include/clang/Lex/Lexer.h
+++ clang/include/clang/Lex/Lexer.h
@@ -288,14 +288,8 @@
     return BufferPtr - BufferStart;
   }
 
-  /// Skip over \p NumBytes bytes.
-  ///
-  /// If the skip is successful, the next token will be lexed from the new
-  /// offset. The lexer also assumes that we skipped to the start of the line.
-  ///
-  /// \returns true if the skip failed (new offset would have been past the
-  /// end of the buffer), false otherwise.
-  bool skipOver(unsigned NumBytes);
+  /// Set the lexer's buffer pointer to \p Offset.
+  void seek(unsigned Offset, bool IsAtStartOfLine);
 
   /// Stringify - Convert the specified string into a C string by i) escaping
   /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
Index: clang/include/clang/Lex/DependencyDirectivesScanner.h
===================================================================
--- clang/include/clang/Lex/DependencyDirectivesScanner.h
+++ clang/include/clang/Lex/DependencyDirectivesScanner.h
@@ -19,15 +19,41 @@
 
 #include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
 
 namespace clang {
 
+namespace tok {
+enum TokenKind : unsigned short;
+}
+
 class DiagnosticsEngine;
 
 namespace dependency_directives_scan {
 
+/// Token lexed as part of dependency directive scanning.
+struct Token {
+  /// Offset into the original source input.
+  unsigned Offset;
+  unsigned Length;
+  tok::TokenKind Kind;
+  unsigned short Flags;
+
+  Token(unsigned Offset, unsigned Length, tok::TokenKind Kind,
+        unsigned short Flags)
+      : Offset(Offset), Length(Length), Kind(Kind), Flags(Flags) {}
+
+  unsigned getEnd() const { return Offset + Length; }
+
+  bool is(tok::TokenKind K) const { return Kind == K; }
+  bool isNot(tok::TokenKind K) const { return Kind != K; }
+  bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const {
+    return is(K1) || is(K2);
+  }
+  template <typename... Ts> bool isOneOf(tok::TokenKind K1, Ts... Ks) const {
+    return is(K1) || isOneOf(Ks...);
+  }
+};
+
 /// Represents the kind of preprocessor directive or a module declaration that
 /// is tracked by the scanner in its token output.
 enum DirectiveKind : uint8_t {
@@ -52,9 +78,10 @@
   pp_else,
   pp_endif,
   decl_at_import,
-  cxx_export_decl,
   cxx_module_decl,
   cxx_import_decl,
+  cxx_export_module_decl,
+  cxx_export_import_decl,
   pp_eof,
 };
 
@@ -62,35 +89,48 @@
 /// scanning. It's used to track various preprocessor directives that could
 /// potentially have an effect on the depedencies.
 struct Directive {
+  ArrayRef<Token> Tokens;
+
   /// The kind of token.
   DirectiveKind Kind = pp_none;
 
-  /// Offset into the output byte stream of where the directive begins.
-  int Offset = -1;
-
-  Directive(DirectiveKind K, int Offset) : Kind(K), Offset(Offset) {}
+  Directive() = default;
+  Directive(DirectiveKind K, ArrayRef<Token> Tokens)
+      : Tokens(Tokens), Kind(K) {}
 };
 
 } // end namespace dependency_directives_scan
 
-/// Minimize the input down to the preprocessor directives that might have
+/// Scan the input for the preprocessor directives that might have
 /// an effect on the dependencies for a compilation unit.
 ///
-/// This function deletes all non-preprocessor code, and strips anything that
-/// can't affect what gets included. It canonicalizes whitespace where
-/// convenient to stabilize the output against formatting changes in the input.
-///
-/// Clears the output vectors at the beginning of the call.
+/// This function ignores all non-preprocessor code and anything that
+/// can't affect what gets included.
 ///
 /// \returns false on success, true on error. If the diagnostic engine is not
 /// null, an appropriate error is reported using the given input location
-/// with the offset that corresponds to the minimizer's current buffer offset.
+/// with the offset that corresponds to the \p Input buffer offset.
 bool scanSourceForDependencyDirectives(
-    llvm::StringRef Input, llvm::SmallVectorImpl<char> &Output,
-    llvm::SmallVectorImpl<dependency_directives_scan::Directive> &Directives,
+    StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
+    SmallVectorImpl<dependency_directives_scan::Directive> &Directives,
     DiagnosticsEngine *Diags = nullptr,
     SourceLocation InputSourceLoc = SourceLocation());
 
+/// Print the previously scanned dependency directives as minimized source text.
+///
+/// \param Source The original source text that the dependency directives were
+/// scanned from.
+/// \param Directives The previously scanned dependency
+/// directives.
+/// \param OS The stream to print the dependency directives on.
+///
+/// This is used primarily for testing purposes; during dependency scanning the
+/// \c Lexer uses the tokens directly, not their printed version.
+void printDependencyDirectivesAsSource(
+    StringRef Source,
+    ArrayRef<dependency_directives_scan::Directive> Directives,
+    llvm::raw_ostream &OS);
+
 } // end namespace clang
 
 #endif // LLVM_CLANG_LEX_DEPENDENCYDIRECTIVESSCANNER_H

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D125487: [Tooling/DependencyScanning] Refactor dependency scanning to produce pre-lexed preprocessor directive tokens, instead of minimized sources

Reply via email to