akyrtzi updated this revision to Diff 429766.
akyrtzi added a comment.

Make sure to enable line comments for dependency directive lexing.
Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D125487/new/

https://reviews.llvm.org/D125487

Files:
  clang/include/clang/Lex/DependencyDirectivesScanner.h
  clang/include/clang/Lex/Lexer.h
  clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
  clang/lib/Frontend/FrontendActions.cpp
  clang/lib/Lex/DependencyDirectivesScanner.cpp
  clang/lib/Lex/Lexer.cpp
  clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
  clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c
  clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c
  clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
  clang/unittests/Tooling/DependencyScannerTest.cpp
Index: clang/unittests/Tooling/DependencyScannerTest.cpp =================================================================== --- clang/unittests/Tooling/DependencyScannerTest.cpp +++ clang/unittests/Tooling/DependencyScannerTest.cpp @@ -204,51 +204,5 @@ EXPECT_EQ(convert_to_slash(Deps[5]), "/root/symlink.h"); } -namespace dependencies { -TEST(DependencyScanningFilesystem, IgnoredFilesAreCachedSeparately1) { - auto VFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>(); - VFS->addFile("/mod.h", 0, - llvm::MemoryBuffer::getMemBuffer("#include <foo.h>\n" - "// hi there!\n")); - - DependencyScanningFilesystemSharedCache SharedCache; - DependencyScanningWorkerFilesystem DepFS(SharedCache, VFS); - - DepFS.enableDirectivesScanningOfAllFiles(); // Let's be explicit for clarity. - auto StatusMinimized0 = DepFS.status("/mod.h"); - DepFS.disableDirectivesScanning("/mod.h"); - auto StatusFull1 = DepFS.status("/mod.h"); - - EXPECT_TRUE(StatusMinimized0); - EXPECT_TRUE(StatusFull1); - EXPECT_EQ(StatusMinimized0->getSize(), 17u); - EXPECT_EQ(StatusFull1->getSize(), 30u); - EXPECT_EQ(StatusMinimized0->getName(), StringRef("/mod.h")); - EXPECT_EQ(StatusFull1->getName(), StringRef("/mod.h")); -} - -TEST(DependencyScanningFilesystem, IgnoredFilesAreCachedSeparately2) { - auto VFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>(); - VFS->addFile("/mod.h", 0, - llvm::MemoryBuffer::getMemBuffer("#include <foo.h>\n" - "// hi there!\n")); - - DependencyScanningFilesystemSharedCache SharedCache; - DependencyScanningWorkerFilesystem DepFS(SharedCache, VFS); - - DepFS.disableDirectivesScanning("/mod.h"); - auto StatusFull0 = DepFS.status("/mod.h"); - DepFS.enableDirectivesScanningOfAllFiles(); - auto StatusMinimized1 = DepFS.status("/mod.h"); - - EXPECT_TRUE(StatusFull0); - EXPECT_TRUE(StatusMinimized1); - EXPECT_EQ(StatusFull0->getSize(), 30u); - EXPECT_EQ(StatusMinimized1->getSize(), 17u); - EXPECT_EQ(StatusFull0->getName(), StringRef("/mod.h")); - 
EXPECT_EQ(StatusMinimized1->getName(), StringRef("/mod.h")); -} - -} // end namespace dependencies } // end namespace tooling } // end namespace clang Index: clang/unittests/Lex/DependencyDirectivesScannerTest.cpp =================================================================== --- clang/unittests/Lex/DependencyDirectivesScannerTest.cpp +++ clang/unittests/Lex/DependencyDirectivesScannerTest.cpp @@ -14,39 +14,58 @@ using namespace clang; using namespace clang::dependency_directives_scan; -static bool minimizeSourceToDependencyDirectives(StringRef Input, - SmallVectorImpl<char> &Out) { - SmallVector<dependency_directives_scan::Directive, 32> Directives; - return scanSourceForDependencyDirectives(Input, Out, Directives); +static bool minimizeSourceToDependencyDirectives( + StringRef Input, SmallVectorImpl<char> &Out, + SmallVectorImpl<dependency_directives_scan::Token> &Tokens, + SmallVectorImpl<Directive> &Directives) { + Out.clear(); + Tokens.clear(); + Directives.clear(); + if (scanSourceForDependencyDirectives(Input, Tokens, Directives)) + return true; + + raw_svector_ostream OS(Out); + printDependencyDirectivesAsSource(Input, Directives, OS); + if (!Out.empty() && Out.back() != '\n') + Out.push_back('\n'); + Out.push_back('\0'); + Out.pop_back(); + + return false; } -static bool -minimizeSourceToDependencyDirectives(StringRef Input, - SmallVectorImpl<char> &Out, - SmallVectorImpl<Directive> &Directives) { - return scanSourceForDependencyDirectives(Input, Out, Directives); +static bool minimizeSourceToDependencyDirectives(StringRef Input, + SmallVectorImpl<char> &Out) { + SmallVector<dependency_directives_scan::Token, 16> Tokens; + SmallVector<Directive, 32> Directives; + return minimizeSourceToDependencyDirectives(Input, Out, Tokens, Directives); } namespace { TEST(MinimizeSourceToDependencyDirectivesTest, Empty) { SmallVector<char, 128> Out; + SmallVector<dependency_directives_scan::Token, 4> Tokens; SmallVector<Directive, 4> Directives; - 
ASSERT_FALSE(minimizeSourceToDependencyDirectives("", Out, Directives)); + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("", Out, Tokens, Directives)); EXPECT_TRUE(Out.empty()); + EXPECT_TRUE(Tokens.empty()); ASSERT_EQ(1u, Directives.size()); ASSERT_EQ(pp_eof, Directives.back().Kind); - ASSERT_FALSE( - minimizeSourceToDependencyDirectives("abc def\nxyz", Out, Directives)); + ASSERT_FALSE(minimizeSourceToDependencyDirectives("abc def\nxyz", Out, Tokens, + Directives)); EXPECT_TRUE(Out.empty()); + EXPECT_TRUE(Tokens.empty()); ASSERT_EQ(1u, Directives.size()); ASSERT_EQ(pp_eof, Directives.back().Kind); } -TEST(MinimizeSourceToDependencyDirectivesTest, AllDirectives) { +TEST(MinimizeSourceToDependencyDirectivesTest, AllTokens) { SmallVector<char, 128> Out; + SmallVector<dependency_directives_scan::Token, 4> Tokens; SmallVector<Directive, 4> Directives; ASSERT_FALSE( @@ -71,7 +90,7 @@ "#pragma include_alias(<A>, <B>)\n" "export module m;\n" "import m;\n", - Out, Directives)); + Out, Tokens, Directives)); EXPECT_EQ(pp_define, Directives[0].Kind); EXPECT_EQ(pp_undef, Directives[1].Kind); EXPECT_EQ(pp_endif, Directives[2].Kind); @@ -91,19 +110,28 @@ EXPECT_EQ(pp_pragma_push_macro, Directives[16].Kind); EXPECT_EQ(pp_pragma_pop_macro, Directives[17].Kind); EXPECT_EQ(pp_pragma_include_alias, Directives[18].Kind); - EXPECT_EQ(cxx_export_decl, Directives[19].Kind); - EXPECT_EQ(cxx_module_decl, Directives[20].Kind); - EXPECT_EQ(cxx_import_decl, Directives[21].Kind); - EXPECT_EQ(pp_eof, Directives[22].Kind); + EXPECT_EQ(cxx_export_module_decl, Directives[19].Kind); + EXPECT_EQ(cxx_import_decl, Directives[20].Kind); + EXPECT_EQ(pp_eof, Directives[21].Kind); +} + +TEST(MinimizeSourceToDependencyDirectivesTest, EmptyHash) { + SmallVector<char, 128> Out; + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#\n#define MACRO a\n", Out)); + EXPECT_STREQ("#define MACRO a\n", Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, Define) { SmallVector<char, 128> Out; + 
SmallVector<dependency_directives_scan::Token, 4> Tokens; SmallVector<Directive, 4> Directives; - ASSERT_FALSE( - minimizeSourceToDependencyDirectives("#define MACRO", Out, Directives)); + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO", Out, + Tokens, Directives)); EXPECT_STREQ("#define MACRO\n", Out.data()); + ASSERT_EQ(4u, Tokens.size()); ASSERT_EQ(2u, Directives.size()); ASSERT_EQ(pp_define, Directives.front().Kind); } @@ -144,25 +172,25 @@ ASSERT_FALSE(minimizeSourceToDependencyDirectives( "#define MACRO con tent ", Out)); - EXPECT_STREQ("#define MACRO con tent\n", Out.data()); + EXPECT_STREQ("#define MACRO con tent\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives( "#define MACRO() con tent ", Out)); - EXPECT_STREQ("#define MACRO() con tent\n", Out.data()); + EXPECT_STREQ("#define MACRO() con tent\n", Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, DefineInvalidMacroArguments) { SmallVector<char, 128> Out; ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO((a))", Out)); - EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data()); + EXPECT_STREQ("#define MACRO((a))\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO(", Out)); - EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data()); + EXPECT_STREQ("#define MACRO(\n", Out.data()); ASSERT_FALSE( minimizeSourceToDependencyDirectives("#define MACRO(a * b)", Out)); - EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data()); + EXPECT_STREQ("#define MACRO(a*b)\n", Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, DefineHorizontalWhitespace) { @@ -170,19 +198,19 @@ ASSERT_FALSE(minimizeSourceToDependencyDirectives( "#define MACRO(\t)\tcon \t tent\t", Out)); - EXPECT_STREQ("#define MACRO() con \t tent\n", Out.data()); + EXPECT_STREQ("#define MACRO() con tent\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives( "#define MACRO(\f)\fcon \f tent\f", Out)); - EXPECT_STREQ("#define MACRO() con \f 
tent\n", Out.data()); + EXPECT_STREQ("#define MACRO() con tent\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives( "#define MACRO(\v)\vcon \v tent\v", Out)); - EXPECT_STREQ("#define MACRO() con \v tent\n", Out.data()); + EXPECT_STREQ("#define MACRO() con tent\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives( "#define MACRO \t\v\f\v\t con\f\t\vtent\v\f \v", Out)); - EXPECT_STREQ("#define MACRO con\f\t\vtent\n", Out.data()); + EXPECT_STREQ("#define MACRO con tent\n", Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, DefineMultilineArgs) { @@ -255,25 +283,27 @@ TEST(MinimizeSourceToDependencyDirectivesTest, DefineNumber) { SmallVector<char, 128> Out; - ASSERT_TRUE(minimizeSourceToDependencyDirectives("#define 0\n", Out)); + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define 0\n", Out)); } TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoName) { SmallVector<char, 128> Out; - ASSERT_TRUE(minimizeSourceToDependencyDirectives("#define &\n", Out)); + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define &\n", Out)); } TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoWhitespace) { SmallVector<char, 128> Out; ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND&\n", Out)); - EXPECT_STREQ("#define AND &\n", Out.data()); + EXPECT_STREQ("#define AND&\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND\\\n" "&\n", Out)); - EXPECT_STREQ("#define AND &\n", Out.data()); + EXPECT_STREQ("#define AND\\\n" + "&\n", + Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, MultilineComment) { @@ -303,6 +333,14 @@ Out.data()); } +TEST(MinimizeSourceToDependencyDirectivesTest, CommentSlashSlashStar) { + SmallVector<char, 128> Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#define MACRO 1 //* blah */\n", Out)); + EXPECT_STREQ("#define MACRO 1\n", Out.data()); +} + TEST(MinimizeSourceToDependencyDirectivesTest, Ifdef) { SmallVector<char, 128> Out; @@ -481,6 
+519,9 @@ ASSERT_FALSE( minimizeSourceToDependencyDirectives("#__include_macros <A>\n", Out)); EXPECT_STREQ("#__include_macros <A>\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#include MACRO\n", Out)); + EXPECT_STREQ("#include MACRO\n", Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, AtImport) { @@ -507,8 +548,9 @@ SmallVector<char, 128> Out; ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import A\n", Out)); - ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import MACRO(A);\n", Out)); - ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import \" \";\n", Out)); + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("@import MACRO(A);\n", Out)); + ASSERT_FALSE(minimizeSourceToDependencyDirectives("@import \" \";\n", Out)); } TEST(MinimizeSourceToDependencyDirectivesTest, RawStringLiteral) { @@ -559,7 +601,8 @@ "#define GUARD\n" "#endif\n", Out)); - EXPECT_STREQ("#ifndef GUARD\n" + EXPECT_STREQ("#if\\\n" + "ndef GUARD\n" "#define GUARD\n" "#endif\n", Out.data()); @@ -567,12 +610,16 @@ ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\n" "RD\n", Out)); - EXPECT_STREQ("#define GUARD\n", Out.data()); + EXPECT_STREQ("#define GUA\\\n" + "RD\n", + Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\r" "RD\n", Out)); - EXPECT_STREQ("#define GUARD\n", Out.data()); + EXPECT_STREQ("#define GUA\\\r" + "RD\n", + Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\n" " RD\n", @@ -588,7 +635,10 @@ "2 + \\\t\n" "3\n", Out)); - EXPECT_STREQ("#define A 1 + 2 + 3\n", Out.data()); + EXPECT_STREQ("#define A 1+\\ \n" + "2+\\\t\n" + "3\n", + Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, PoundWarningAndError) { @@ -682,6 +732,7 @@ TEST(MinimizeSourceToDependencyDirectivesTest, PragmaOnce) { SmallVector<char, 128> Out; + SmallVector<dependency_directives_scan::Token, 4> Tokens; SmallVector<Directive, 4> Directives; StringRef Source = R"(// comment @@ -689,7 
+740,8 @@ // another comment #include <test.h> )"; - ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out, Directives)); + ASSERT_FALSE( + minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives)); EXPECT_STREQ("#pragma once\n#include <test.h>\n", Out.data()); ASSERT_EQ(Directives.size(), 3u); EXPECT_EQ(Directives[0].Kind, dependency_directives_scan::pp_pragma_once); @@ -700,7 +752,7 @@ #include <test.h> )"; ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out)); - EXPECT_STREQ("#pragma once\n#include <test.h>\n", Out.data()); + EXPECT_STREQ("#pragma once extra tokens\n#include <test.h>\n", Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, @@ -755,11 +807,12 @@ Source = "#define X \"\\ \r\nx\n#include <x>\n"; ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out)); - EXPECT_STREQ("#define X \"\\ \r\nx\n#include <x>\n", Out.data()); + EXPECT_STREQ("#define X\"\\ \r\nx\n#include <x>\n", Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, CxxModules) { SmallVector<char, 128> Out; + SmallVector<dependency_directives_scan::Token, 4> Tokens; SmallVector<Directive, 4> Directives; StringRef Source = R"( @@ -789,16 +842,17 @@ import f(->a = 3); } )"; - ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out, Directives)); - EXPECT_STREQ("#include \"textual-header.h\"\nexport module m;\n" - "export import :l [[rename]];\n" - "import <<= 3;\nimport a b d e d e f e;\n" - "import foo [[no_unique_address]];\nimport foo();\n" - "import f(:sefse);\nimport f(->a = 3);\n", + ASSERT_FALSE( + minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives)); + EXPECT_STREQ("#include \"textual-header.h\"\nexport module m;" + "exp\\\nort import:l[[rename]];" + "import<<=3;import a b d e d e f e;" + "import foo[[no_unique_address]];import foo();" + "import f(:sefse);import f(->a=3);\n", Out.data()); - ASSERT_EQ(Directives.size(), 12u); - EXPECT_EQ(Directives[0].Kind, dependency_directives_scan::pp_include); - 
EXPECT_EQ(Directives[2].Kind, dependency_directives_scan::cxx_module_decl); + ASSERT_EQ(Directives.size(), 10u); + EXPECT_EQ(Directives[0].Kind, pp_include); + EXPECT_EQ(Directives[1].Kind, cxx_export_module_decl); } } // end anonymous namespace Index: clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c =================================================================== --- clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c +++ clang/test/Lexer/minimize_source_to_dependency_directives_pragmas.c @@ -15,7 +15,7 @@ #pragma include_alias(<string>, "mystring.h") // CHECK: #pragma once -// CHECK-NEXT: #pragma push_macro( "MYMACRO" ) +// CHECK-NEXT: #pragma push_macro("MYMACRO") // CHECK-NEXT: #pragma pop_macro("MYMACRO") // CHECK-NEXT: #pragma clang module import mymodule // CHECK-NEXT: #pragma include_alias(<string>, "mystring.h") Index: clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c =================================================================== --- clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c +++ clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c @@ -1,3 +1,4 @@ -// RUN: %clang_cc1 -verify -print-dependency-directives-minimized-source %s 2>&1 +// RUN: %clang_cc1 -print-dependency-directives-minimized-source %s 2>&1 | FileCheck %s -#define 0 0 // expected-error {{macro name must be an identifier}} +#define 0 0 +// CHECK: #define 0 0 Index: clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp =================================================================== --- clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp +++ clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h" -#include "clang/Lex/DependencyDirectivesScanner.h" 
#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SmallVectorMemoryBuffer.h" #include "llvm/Support/Threading.h" @@ -45,47 +44,40 @@ const CachedFileSystemEntry &Entry, StringRef Filename, bool Disable) { if (Entry.isError() || Entry.isDirectory() || Disable || !shouldScanForDirectives(Filename, Entry.getUniqueID())) - return EntryRef(/*Minimized=*/false, Filename, Entry); + return EntryRef(Filename, Entry); CachedFileContents *Contents = Entry.getCachedContents(); assert(Contents && "contents not initialized"); // Double-checked locking. - if (Contents->MinimizedAccess.load()) - return EntryRef(/*Minimized=*/true, Filename, Entry); + if (Contents->DepDirectives.load()) + return EntryRef(Filename, Entry); std::lock_guard<std::mutex> GuardLock(Contents->ValueLock); // Double-checked locking. - if (Contents->MinimizedAccess.load()) - return EntryRef(/*Minimized=*/true, Filename, Entry); + if (Contents->DepDirectives.load()) + return EntryRef(Filename, Entry); - llvm::SmallString<1024> MinimizedFileContents; - // Minimize the file down to directives that might affect the dependencies. - SmallVector<dependency_directives_scan::Directive, 64> Tokens; + SmallVector<dependency_directives_scan::Directive, 64> Directives; + // Scan the file for preprocessor directives that might affect the + // dependencies. if (scanSourceForDependencyDirectives(Contents->Original->getBuffer(), - MinimizedFileContents, Tokens)) { + Contents->DepDirectiveTokens, + Directives)) { + Contents->DepDirectiveTokens.clear(); // FIXME: Propagate the diagnostic if desired by the client. - // Use the original file if the minimization failed. 
- Contents->MinimizedStorage = - llvm::MemoryBuffer::getMemBuffer(*Contents->Original); - Contents->MinimizedAccess.store(Contents->MinimizedStorage.get()); - return EntryRef(/*Minimized=*/true, Filename, Entry); + Contents->DepDirectives.store(new Optional<DependencyDirectivesTy>()); + return EntryRef(Filename, Entry); } - // The contents produced by the minimizer must be null terminated. - assert(MinimizedFileContents.data()[MinimizedFileContents.size()] == '\0' && - "not null terminated contents"); - - Contents->MinimizedStorage = std::make_unique<llvm::SmallVectorMemoryBuffer>( - std::move(MinimizedFileContents)); - // This function performed double-checked locking using `MinimizedAccess`. - // Assigning it must be the last thing this function does. If we were to - // assign it before `PPSkippedRangeMapping`, other threads may skip the - // critical section (`MinimizedAccess != nullptr`) and access the mappings - // that are about to be initialized, leading to a data race. - Contents->MinimizedAccess.store(Contents->MinimizedStorage.get()); - return EntryRef(/*Minimized=*/true, Filename, Entry); + // This function performed double-checked locking using `DepDirectives`. + // Assigning it must be the last thing this function does, otherwise other + // threads may skip the + // critical section (`DepDirectives != nullptr`), leading to a data race. 
+ Contents->DepDirectives.store( + new Optional<DependencyDirectivesTy>(std::move(Directives))); + return EntryRef(Filename, Entry); } DependencyScanningFilesystemSharedCache:: Index: clang/lib/Lex/Lexer.cpp =================================================================== --- clang/lib/Lex/Lexer.cpp +++ clang/lib/Lex/Lexer.cpp @@ -226,13 +226,11 @@ return L; } -bool Lexer::skipOver(unsigned NumBytes) { - IsAtPhysicalStartOfLine = true; - IsAtStartOfLine = true; - if ((BufferPtr + NumBytes) > BufferEnd) - return true; - BufferPtr += NumBytes; - return false; +void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) { + this->IsAtPhysicalStartOfLine = IsAtStartOfLine; + this->IsAtStartOfLine = IsAtStartOfLine; + assert((BufferStart + Offset) <= BufferEnd); + BufferPtr = BufferStart + Offset; } template <typename T> static void StringifyImpl(T &Str, char Quote) { Index: clang/lib/Lex/DependencyDirectivesScanner.cpp =================================================================== --- clang/lib/Lex/DependencyDirectivesScanner.cpp +++ clang/lib/Lex/DependencyDirectivesScanner.cpp @@ -18,83 +18,127 @@ #include "clang/Basic/CharInfo.h" #include "clang/Basic/Diagnostic.h" #include "clang/Lex/LexDiagnostic.h" +#include "clang/Lex/Lexer.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/MemoryBuffer.h" -using namespace llvm; using namespace clang; using namespace clang::dependency_directives_scan; +using namespace llvm; namespace { -struct Scanner { - /// Minimized output. - SmallVectorImpl<char> &Out; - /// The known tokens encountered during the minimization. 
- SmallVectorImpl<Directive> &Directives; +struct DirectiveWithTokens { + DirectiveKind Kind; + unsigned NumTokens; - Scanner(SmallVectorImpl<char> &Out, SmallVectorImpl<Directive> &Directives, - StringRef Input, DiagnosticsEngine *Diags, - SourceLocation InputSourceLoc) - : Out(Out), Directives(Directives), Input(Input), Diags(Diags), - InputSourceLoc(InputSourceLoc) {} + DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens) + : Kind(Kind), NumTokens(NumTokens) {} +}; + +/// Does an efficient "scan" of the sources to detect the presence of +/// preprocessor (or module import) directives and collects the raw lexed tokens +/// for those directives so that the \p Lexer can "replay" them when the file is +/// included. +/// +/// Note that the behavior of the raw lexer is affected by the language mode, +/// while at this point we want to do a scan and collect tokens once, +/// irrespective of the language mode that the file will get included in. To +/// compensate for that the \p Lexer, while "replaying", will adjust a token +/// where appropriate, when it could affect the preprocessor's state. +/// For example in a directive like +/// +/// \code +/// #if __has_cpp_attribute(clang::fallthrough) +/// \endcode +/// +/// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2 +/// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon' +/// while in C++ mode. +struct Scanner { + Scanner(StringRef Input, + SmallVectorImpl<dependency_directives_scan::Token> &Tokens, + DiagnosticsEngine *Diags, SourceLocation InputSourceLoc) + : Input(Input), Tokens(Tokens), Diags(Diags), + InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()), + TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(), + Input.end()) {} + + static LangOptions getLangOptsForDepScanning() { + LangOptions LangOpts; + // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'. 
+ LangOpts.ObjC = true; + LangOpts.LineComment = true; + return LangOpts; + } /// Lex the provided source and emit the directive tokens. /// /// \returns True on error. - bool scan(); + bool scan(SmallVectorImpl<Directive> &Directives); private: - struct IdInfo { - const char *Last; - StringRef Name; - }; + /// Lexes next token and advances \p First and the \p Lexer. + LLVM_NODISCARD dependency_directives_scan::Token & + lexToken(const char *&First, const char *const End); - /// Lex an identifier. + dependency_directives_scan::Token &lexIncludeFilename(const char *&First, + const char *const End); + + /// Lexes next token and if it is identifier returns its string, otherwise + /// it skips the current line and returns \p None. /// - /// \pre First points at a valid identifier head. - LLVM_NODISCARD IdInfo lexIdentifier(const char *First, const char *const End); - LLVM_NODISCARD bool isNextIdentifier(StringRef Id, const char *&First, - const char *const End); + /// In any case (whatever the token kind) \p First and the \p Lexer will + /// advance beyond the token. + LLVM_NODISCARD Optional<StringRef> + tryLexIdentifierOrSkipLine(const char *&First, const char *const End); + + /// Used when it is certain that next token is an identifier. + LLVM_NODISCARD StringRef lexIdentifier(const char *&First, + const char *const End); + + /// Lexes next token and returns true iff it is an identifier that matches \p + /// Id, otherwise it skips the current line and returns false. + /// + /// In any case (whatever the token kind) \p First and the \p Lexer will + /// advance beyond the token. 
+ LLVM_NODISCARD bool isNextIdentifierOrSkipLine(StringRef Id, + const char *&First, + const char *const End); + LLVM_NODISCARD bool scanImpl(const char *First, const char *const End); LLVM_NODISCARD bool lexPPLine(const char *&First, const char *const End); LLVM_NODISCARD bool lexAt(const char *&First, const char *const End); LLVM_NODISCARD bool lexModule(const char *&First, const char *const End); - LLVM_NODISCARD bool lexDefine(const char *&First, const char *const End); + LLVM_NODISCARD bool lexDefine(const char *HashLoc, const char *&First, + const char *const End); LLVM_NODISCARD bool lexPragma(const char *&First, const char *const End); LLVM_NODISCARD bool lexEndif(const char *&First, const char *const End); - LLVM_NODISCARD bool lexDefault(DirectiveKind Kind, StringRef Directive, - const char *&First, const char *const End); - Directive &pushDirective(DirectiveKind K) { - Directives.emplace_back(K, Out.size()); - return Directives.back(); + LLVM_NODISCARD bool lexDefault(DirectiveKind Kind, const char *&First, + const char *const End); + LLVM_NODISCARD bool lexModuleDirectiveBody(DirectiveKind Kind, + const char *&First, + const char *const End); + void lexPPDirectiveBody(const char *&First, const char *const End); + + DirectiveWithTokens &pushDirective(DirectiveKind Kind) { + Tokens.append(CurDirToks); + DirsWithToks.emplace_back(Kind, CurDirToks.size()); + CurDirToks.clear(); + return DirsWithToks.back(); } void popDirective() { - Out.resize(Directives.back().Offset); - Directives.pop_back(); + Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens); } DirectiveKind topDirective() const { - return Directives.empty() ? pp_none : Directives.back().Kind; + return DirsWithToks.empty() ? 
pp_none : DirsWithToks.back().Kind; } - Scanner &put(char Byte) { - Out.push_back(Byte); - return *this; + unsigned getOffsetAt(const char *CurPtr) const { + return CurPtr - Input.data(); } - Scanner &append(StringRef S) { return append(S.begin(), S.end()); } - Scanner &append(const char *First, const char *Last) { - Out.append(First, Last); - return *this; - } - - void printToNewline(const char *&First, const char *const End); - void printAdjacentModuleNameParts(const char *&First, const char *const End); - LLVM_NODISCARD bool printAtImportBody(const char *&First, - const char *const End); - void printDirectiveBody(const char *&First, const char *const End); - void printAdjacentMacroArgs(const char *&First, const char *const End); - LLVM_NODISCARD bool printMacroArgs(const char *&First, const char *const End); /// Reports a diagnostic if the diagnostic engine is provided. Always returns /// true at the end. @@ -102,8 +146,14 @@ StringMap<char> SplitIds; StringRef Input; + SmallVectorImpl<dependency_directives_scan::Token> &Tokens; DiagnosticsEngine *Diags; SourceLocation InputSourceLoc; + + SmallVector<dependency_directives_scan::Token, 32> CurDirToks; + SmallVector<DirectiveWithTokens, 64> DirsWithToks; + LangOptions LangOpts; + Lexer TheLexer; }; } // end anonymous namespace @@ -112,7 +162,7 @@ if (!Diags) return true; assert(CurPtr >= Input.data() && "invalid buffer ptr"); - Diags->Report(InputSourceLoc.getLocWithOffset(CurPtr - Input.data()), Err); + Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err); return true; } @@ -265,30 +315,6 @@ } } -static const char *findLastNonSpace(const char *First, const char *Last) { - assert(First <= Last); - while (First != Last && isHorizontalWhitespace(Last[-1])) - --Last; - return Last; -} - -static const char *findLastNonSpaceNonBackslash(const char *First, - const char *Last) { - assert(First <= Last); - while (First != Last && - (isHorizontalWhitespace(Last[-1]) || Last[-1] == '\\')) - --Last; - 
return Last; -} - -static const char *findFirstTrailingSpace(const char *First, const char *Last) { - const char *LastNonSpace = findLastNonSpace(First, Last); - if (Last == LastNonSpace) - return Last; - assert(isHorizontalWhitespace(LastNonSpace[0])); - return LastNonSpace + 1; -} - static void skipLineComment(const char *&First, const char *const End) { assert(First[0] == '/' && First[1] == '/'); First += 2; @@ -396,67 +422,6 @@ skipLine(First, End); } -void Scanner::printToNewline(const char *&First, const char *const End) { - while (First != End && !isVerticalWhitespace(*First)) { - const char *Last = First; - do { - // Iterate over strings correctly to avoid comments and newlines. - if (*Last == '"' || *Last == '\'' || - (*Last == '<' && - (topDirective() == pp_include || topDirective() == pp_import))) { - if (LLVM_UNLIKELY(isRawStringLiteral(First, Last))) - skipRawString(Last, End); - else - skipString(Last, End); - continue; - } - if (*Last != '/' || End - Last < 2) { - ++Last; - continue; // Gather the rest up to print verbatim. - } - - if (Last[1] != '/' && Last[1] != '*') { - ++Last; - continue; - } - - // Deal with "//..." and "/*...*/". - append(First, findFirstTrailingSpace(First, Last)); - First = Last; - - if (Last[1] == '/') { - skipLineComment(First, End); - return; - } - - put(' '); - skipBlockComment(First, End); - skipOverSpaces(First, End); - Last = First; - } while (Last != End && !isVerticalWhitespace(*Last)); - - // Print out the string. - const char *LastBeforeTrailingSpace = findLastNonSpace(First, Last); - if (Last == End || LastBeforeTrailingSpace == First || - LastBeforeTrailingSpace[-1] != '\\') { - append(First, LastBeforeTrailingSpace); - First = Last; - skipNewline(First, End); - return; - } - - // Print up to the last character that's not a whitespace or backslash. - // Then print exactly one space, which matters when tokens are separated by - // a line continuation. 
- append(First, findLastNonSpaceNonBackslash(First, Last)); - put(' '); - - First = Last; - skipNewline(First, End); - skipOverSpaces(First, End); - } -} - static void skipWhitespace(const char *&First, const char *const End) { for (;;) { assert(First <= End); @@ -489,176 +454,134 @@ } } -void Scanner::printAdjacentModuleNameParts(const char *&First, - const char *const End) { - // Skip over parts of the body. - const char *Last = First; - do - ++Last; - while (Last != End && (isAsciiIdentifierContinue(*Last) || *Last == '.')); - append(First, Last); - First = Last; -} - -bool Scanner::printAtImportBody(const char *&First, const char *const End) { +bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First, + const char *const End) { + const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset; for (;;) { - skipWhitespace(First, End); - if (First == End) - return true; - - if (isVerticalWhitespace(*First)) { - skipNewline(First, End); - continue; - } - - // Found a semicolon. - if (*First == ';') { - put(*First++).put('\n'); - return false; - } - - // Don't handle macro expansions inside @import for now. - if (!isAsciiIdentifierContinue(*First) && *First != '.') - return true; - - printAdjacentModuleNameParts(First, End); + const dependency_directives_scan::Token &Tok = lexToken(First, End); + if (Tok.is(tok::eof)) + return reportError( + DirectiveLoc, + diag::err_dep_source_scanner_missing_semi_after_at_import); + if (Tok.is(tok::semi)) + break; } + pushDirective(Kind); + skipWhitespace(First, End); + if (First == End) + return false; + if (!isVerticalWhitespace(*First)) + return reportError( + DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import); + skipNewline(First, End); + return false; } -void Scanner::printDirectiveBody(const char *&First, const char *const End) { - skipWhitespace(First, End); // Skip initial whitespace. 
- printToNewline(First, End); - while (Out.back() == ' ') - Out.pop_back(); - put('\n'); -} +dependency_directives_scan::Token &Scanner::lexToken(const char *&First, + const char *const End) { + clang::Token Tok; + TheLexer.LexFromRawLexer(Tok); + First = Input.data() + TheLexer.getCurrentBufferOffset(); + assert(First <= End); -LLVM_NODISCARD static const char *lexRawIdentifier(const char *First, - const char *const End) { - assert(isAsciiIdentifierContinue(*First) && "invalid identifer"); - const char *Last = First + 1; - while (Last != End && isAsciiIdentifierContinue(*Last)) - ++Last; - return Last; + unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength(); + CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(), + Tok.getFlags()); + return CurDirToks.back(); } -LLVM_NODISCARD static const char * -getIdentifierContinuation(const char *First, const char *const End) { - if (End - First < 3 || First[0] != '\\' || !isVerticalWhitespace(First[1])) - return nullptr; +dependency_directives_scan::Token & +Scanner::lexIncludeFilename(const char *&First, const char *const End) { + clang::Token Tok; + TheLexer.LexIncludeFilename(Tok); + First = Input.data() + TheLexer.getCurrentBufferOffset(); + assert(First <= End); - ++First; - skipNewline(First, End); - if (First == End) - return nullptr; - return isAsciiIdentifierContinue(First[0]) ? First : nullptr; -} - -Scanner::IdInfo Scanner::lexIdentifier(const char *First, - const char *const End) { - const char *Last = lexRawIdentifier(First, End); - const char *Next = getIdentifierContinuation(Last, End); - if (LLVM_LIKELY(!Next)) - return IdInfo{Last, StringRef(First, Last - First)}; - - // Slow path, where identifiers are split over lines. 
- SmallVector<char, 64> Id(First, Last); - while (Next) { - Last = lexRawIdentifier(Next, End); - Id.append(Next, Last); - Next = getIdentifierContinuation(Last, End); - } - return IdInfo{ - Last, - SplitIds.try_emplace(StringRef(Id.begin(), Id.size()), 0).first->first()}; + unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength(); + CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(), + Tok.getFlags()); + return CurDirToks.back(); } -void Scanner::printAdjacentMacroArgs(const char *&First, - const char *const End) { - // Skip over parts of the body. - const char *Last = First; - do - ++Last; - while (Last != End && - (isAsciiIdentifierContinue(*Last) || *Last == '.' || *Last == ',')); - append(First, Last); - First = Last; +void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) { + while (true) { + const dependency_directives_scan::Token &Tok = lexToken(First, End); + if (Tok.is(tok::eod)) + break; + } } -bool Scanner::printMacroArgs(const char *&First, const char *const End) { - assert(*First == '('); - put(*First++); - for (;;) { - skipWhitespace(First, End); - if (First == End) - return true; +LLVM_NODISCARD Optional<StringRef> +Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) { + const dependency_directives_scan::Token &Tok = lexToken(First, End); + if (Tok.isNot(tok::raw_identifier)) { + if (!Tok.is(tok::eod)) + skipLine(First, End); + return None; + } - if (*First == ')') { - put(*First++); - return false; - } + bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning; + if (LLVM_LIKELY(!NeedsCleaning)) + return Input.slice(Tok.Offset, Tok.getEnd()); - // This is intentionally fairly liberal. - if (!(isAsciiIdentifierContinue(*First) || *First == '.' 
|| *First == ',')) - return true; + SmallString<64> Spelling; + Spelling.resize(Tok.Length); - printAdjacentMacroArgs(First, End); + unsigned SpellingLength = 0; + const char *BufPtr = Input.begin() + Tok.Offset; + const char *AfterIdent = Input.begin() + Tok.getEnd(); + while (BufPtr < AfterIdent) { + unsigned Size; + Spelling[SpellingLength++] = + Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); + BufPtr += Size; } + + return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0) + .first->first(); } -/// Looks for an identifier starting from Last. -/// -/// Updates "First" to just past the next identifier, if any. Returns true iff -/// the identifier matches "Id". -bool Scanner::isNextIdentifier(StringRef Id, const char *&First, - const char *const End) { - skipWhitespace(First, End); - if (First == End || !isAsciiIdentifierStart(*First)) - return false; +StringRef Scanner::lexIdentifier(const char *&First, const char *const End) { + Optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End); + assert(Id.hasValue() && "expected identifier token"); + return Id.getValue(); +} - IdInfo FoundId = lexIdentifier(First, End); - First = FoundId.Last; - return FoundId.Name == Id; +bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First, + const char *const End) { + if (Optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End)) { + if (*FoundId == Id) + return true; + skipLine(First, End); + } + return false; } bool Scanner::lexAt(const char *&First, const char *const End) { // Handle "@import". - const char *ImportLoc = First++; - if (!isNextIdentifier("import", First, End)) { - skipLine(First, End); - return false; - } - pushDirective(decl_at_import); - append("@import "); - if (printAtImportBody(First, End)) - return reportError( - ImportLoc, diag::err_dep_source_scanner_missing_semi_after_at_import); - skipWhitespace(First, End); - if (First == End) + + // Lex '@'. 
+ const dependency_directives_scan::Token &AtTok = lexToken(First, End); + assert(AtTok.is(tok::at)); + (void)AtTok; + + if (!isNextIdentifierOrSkipLine("import", First, End)) return false; - if (!isVerticalWhitespace(*First)) - return reportError( - ImportLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import); - skipNewline(First, End); - return false; + return lexModuleDirectiveBody(decl_at_import, First, End); } bool Scanner::lexModule(const char *&First, const char *const End) { - IdInfo Id = lexIdentifier(First, End); - First = Id.Last; + StringRef Id = lexIdentifier(First, End); bool Export = false; - if (Id.Name == "export") { + if (Id == "export") { Export = true; - skipWhitespace(First, End); - if (!isAsciiIdentifierContinue(*First)) { - skipLine(First, End); + Optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End); + if (!NextId) return false; - } - Id = lexIdentifier(First, End); - First = Id.Last; + Id = *NextId; } - if (Id.Name != "module" && Id.Name != "import") { + if (Id != "module" && Id != "import") { skipLine(First, End); return false; } @@ -680,114 +603,51 @@ } } - if (Export) { - pushDirective(cxx_export_decl); - append("export "); - } + TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false); - if (Id.Name == "module") - pushDirective(cxx_module_decl); + DirectiveKind Kind; + if (Id == "module") + Kind = Export ? cxx_export_module_decl : cxx_module_decl; else - pushDirective(cxx_import_decl); - append(Id.Name); - append(" "); - printToNewline(First, End); - append("\n"); - return false; -} - -bool Scanner::lexDefine(const char *&First, const char *const End) { - pushDirective(pp_define); - append("#define "); - skipWhitespace(First, End); - - if (!isAsciiIdentifierStart(*First)) - return reportError(First, diag::err_pp_macro_not_identifier); + Kind = Export ? 
cxx_export_import_decl : cxx_import_decl; - IdInfo Id = lexIdentifier(First, End); - const char *Last = Id.Last; - append(Id.Name); - if (Last == End) - return false; - if (*Last == '(') { - size_t Size = Out.size(); - if (printMacroArgs(Last, End)) { - // Be robust to bad macro arguments, since they can show up in disabled - // code. - Out.resize(Size); - append("(/* invalid */\n"); - skipLine(Last, End); - return false; - } - } - skipWhitespace(Last, End); - if (Last == End) - return false; - if (!isVerticalWhitespace(*Last)) - put(' '); - printDirectiveBody(Last, End); - First = Last; - return false; + return lexModuleDirectiveBody(Kind, First, End); } bool Scanner::lexPragma(const char *&First, const char *const End) { - // #pragma. - skipWhitespace(First, End); - if (First == End || !isAsciiIdentifierStart(*First)) + Optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End); + if (!FoundId) return false; - IdInfo FoundId = lexIdentifier(First, End); - First = FoundId.Last; - if (FoundId.Name == "once") { - // #pragma once - skipLine(First, End); - pushDirective(pp_pragma_once); - append("#pragma once\n"); - return false; - } - if (FoundId.Name == "push_macro") { - // #pragma push_macro - pushDirective(pp_pragma_push_macro); - append("#pragma push_macro"); - printDirectiveBody(First, End); - return false; - } - if (FoundId.Name == "pop_macro") { - // #pragma pop_macro - pushDirective(pp_pragma_pop_macro); - append("#pragma pop_macro"); - printDirectiveBody(First, End); - return false; - } - if (FoundId.Name == "include_alias") { - // #pragma include_alias - pushDirective(pp_pragma_include_alias); - append("#pragma include_alias"); - printDirectiveBody(First, End); + StringRef Id = FoundId.getValue(); + auto Kind = llvm::StringSwitch<DirectiveKind>(Id) + .Case("once", pp_pragma_once) + .Case("push_macro", pp_pragma_push_macro) + .Case("pop_macro", pp_pragma_pop_macro) + .Case("include_alias", pp_pragma_include_alias) + .Default(pp_none); + if (Kind != 
pp_none) { + lexPPDirectiveBody(First, End); + pushDirective(Kind); return false; } - if (FoundId.Name != "clang") { + if (Id != "clang") { skipLine(First, End); return false; } // #pragma clang. - if (!isNextIdentifier("module", First, End)) { - skipLine(First, End); + if (!isNextIdentifierOrSkipLine("module", First, End)) return false; - } // #pragma clang module. - if (!isNextIdentifier("import", First, End)) { - skipLine(First, End); + if (!isNextIdentifierOrSkipLine("import", First, End)) return false; - } // #pragma clang module import. + lexPPDirectiveBody(First, End); pushDirective(pp_pragma_import); - append("#pragma clang module import "); - printDirectiveBody(First, End); return false; } @@ -808,14 +668,13 @@ return false; } - return lexDefault(pp_endif, "endif", First, End); + return lexDefault(pp_endif, First, End); } -bool Scanner::lexDefault(DirectiveKind Kind, StringRef Directive, - const char *&First, const char *const End) { +bool Scanner::lexDefault(DirectiveKind Kind, const char *&First, + const char *const End) { + lexPPDirectiveBody(First, End); pushDirective(Kind); - put('#').append(Directive).put(' '); - printDirectiveBody(First, End); return false; } @@ -845,6 +704,14 @@ return false; } + TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true); + + auto ScEx1 = make_scope_exit([&]() { + /// Clear Scanner's CurDirToks before returning, in case we didn't push a + /// new directive. + CurDirToks.clear(); + }); + // Handle "@import". if (*First == '@') return lexAt(First, End); @@ -853,25 +720,26 @@ return lexModule(First, End); // Handle preprocessing directives. - ++First; // Skip over '#'. - skipWhitespace(First, End); - if (First == End) - return reportError(First, diag::err_pp_expected_eol); + TheLexer.setParsingPreprocessorDirective(true); + auto ScEx2 = make_scope_exit( + [&]() { TheLexer.setParsingPreprocessorDirective(false); }); - if (!isAsciiIdentifierStart(*First)) { - skipLine(First, End); + // Lex '#'. 
+ const dependency_directives_scan::Token &HashTok = lexToken(First, End); + assert(HashTok.is(tok::hash)); + (void)HashTok; + + Optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End); + if (!FoundId) return false; - } - // Figure out the token. - IdInfo Id = lexIdentifier(First, End); - First = Id.Last; + StringRef Id = FoundId.getValue(); - if (Id.Name == "pragma") + if (Id == "pragma") return lexPragma(First, End); - auto Kind = llvm::StringSwitch<DirectiveKind>(Id.Name) + auto Kind = llvm::StringSwitch<DirectiveKind>(Id) .Case("include", pp_include) .Case("__include_macros", pp___include_macros) .Case("define", pp_define) @@ -888,18 +756,26 @@ .Case("endif", pp_endif) .Default(pp_none); if (Kind == pp_none) { - skipDirective(Id.Name, First, End); + skipDirective(Id, First, End); return false; } if (Kind == pp_endif) return lexEndif(First, End); - if (Kind == pp_define) - return lexDefine(First, End); + switch (Kind) { + case pp_include: + case pp___include_macros: + case pp_include_next: + case pp_import: + lexIncludeFilename(First, End); + break; + default: + break; + } // Everything else. - return lexDefault(Kind, Id.Name, First, End); + return lexDefault(Kind, First, End); } static void skipUTF8ByteOrderMark(const char *&First, const char *const End) { @@ -916,28 +792,65 @@ return false; } -bool Scanner::scan() { +bool Scanner::scan(SmallVectorImpl<Directive> &Directives) { bool Error = scanImpl(Input.begin(), Input.end()); if (!Error) { - // Add a trailing newline and an EOF on success. - if (!Out.empty() && Out.back() != '\n') - Out.push_back('\n'); + // Add an EOF on success. pushDirective(pp_eof); } - // Null-terminate the output. This way the memory buffer that's passed to - // Clang will not have to worry about the terminating '\0'. 
- Out.push_back(0); - Out.pop_back(); + ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens; + for (const DirectiveWithTokens &DirWithToks : DirsWithToks) { + assert(RemainingTokens.size() >= DirWithToks.NumTokens); + Directives.emplace_back(DirWithToks.Kind, + RemainingTokens.take_front(DirWithToks.NumTokens)); + RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens); + } + assert(RemainingTokens.empty()); + return Error; } bool clang::scanSourceForDependencyDirectives( - StringRef Input, SmallVectorImpl<char> &Output, + StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens, SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags, SourceLocation InputSourceLoc) { - Output.clear(); - Directives.clear(); - return Scanner(Output, Directives, Input, Diags, InputSourceLoc).scan(); + return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives); +} + +void clang::printDependencyDirectivesAsSource( + StringRef Source, + ArrayRef<dependency_directives_scan::Directive> Directives, + llvm::raw_ostream &OS) { + // Add a space separator where it is convenient for testing purposes. 
+ auto needsSpaceSeparator = + [](tok::TokenKind Prev, + const dependency_directives_scan::Token &Tok) -> bool { + if (Prev == Tok.Kind) + return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square, + tok::r_square); + if (Prev == tok::raw_identifier && + Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal, + tok::char_constant, tok::header_name)) + return true; + if (Prev == tok::r_paren && + Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal, + tok::char_constant, tok::unknown)) + return true; + if (Prev == tok::comma && + Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less)) + return true; + return false; + }; + + for (const dependency_directives_scan::Directive &Directive : Directives) { + Optional<tok::TokenKind> PrevTokenKind; + for (const dependency_directives_scan::Token &Tok : Directive.Tokens) { + if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok)) + OS << ' '; + PrevTokenKind = Tok.Kind; + OS << Source.slice(Tok.Offset, Tok.getEnd()); + } + } } Index: clang/lib/Frontend/FrontendActions.cpp =================================================================== --- clang/lib/Frontend/FrontendActions.cpp +++ clang/lib/Frontend/FrontendActions.cpp @@ -1157,10 +1157,10 @@ SourceManager &SM = CI.getPreprocessor().getSourceManager(); llvm::MemoryBufferRef FromFile = SM.getBufferOrFake(SM.getMainFileID()); - llvm::SmallString<1024> Output; + llvm::SmallVector<dependency_directives_scan::Token, 16> Tokens; llvm::SmallVector<dependency_directives_scan::Directive, 32> Directives; if (scanSourceForDependencyDirectives( - FromFile.getBuffer(), Output, Directives, &CI.getDiagnostics(), + FromFile.getBuffer(), Tokens, Directives, &CI.getDiagnostics(), SM.getLocForStartOfFile(SM.getMainFileID()))) { assert(CI.getDiagnostics().hasErrorOccurred() && "no errors reported for failure"); @@ -1179,7 +1179,8 @@ } return; } - llvm::outs() << Output; + printDependencyDirectivesAsSource(FromFile.getBuffer(), Directives, + llvm::outs()); } 
void GetDependenciesByModuleNameAction::ExecuteAction() { Index: clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h =================================================================== --- clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h +++ clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h @@ -10,6 +10,7 @@ #define LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGFILESYSTEM_H #include "clang/Basic/LLVM.h" +#include "clang/Lex/DependencyDirectivesScanner.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/Support/Allocator.h" @@ -21,21 +22,26 @@ namespace tooling { namespace dependencies { -/// Original and minimized contents of a cached file entry. Single instance can +using DependencyDirectivesTy = + SmallVector<dependency_directives_scan::Directive, 20>; + +/// Contents and directive tokens of a cached file entry. Single instance can /// be shared between multiple entries. struct CachedFileContents { - CachedFileContents(std::unique_ptr<llvm::MemoryBuffer> Original) - : Original(std::move(Original)), MinimizedAccess(nullptr) {} + CachedFileContents(std::unique_ptr<llvm::MemoryBuffer> Contents) + : Original(std::move(Contents)), DepDirectives(nullptr) {} /// Owning storage for the original contents. std::unique_ptr<llvm::MemoryBuffer> Original; /// The mutex that must be locked before mutating directive tokens. std::mutex ValueLock; - /// Owning storage for the minimized contents. - std::unique_ptr<llvm::MemoryBuffer> MinimizedStorage; + SmallVector<dependency_directives_scan::Token, 10> DepDirectiveTokens; /// Accessor to the directive tokens that's atomic to avoid data races. - std::atomic<llvm::MemoryBuffer *> MinimizedAccess; + /// \p CachedFileContents has ownership of the pointer. 
+ std::atomic<const Optional<DependencyDirectivesTy> *> DepDirectives; + + ~CachedFileContents() { delete DepDirectives.load(); } }; /// An in-memory representation of a file system entity that is of interest to @@ -82,13 +88,17 @@ /// \returns The scanned preprocessor directive tokens of the file that are /// used to speed up preprocessing, if available. - StringRef getDirectiveTokens() const { + Optional<ArrayRef<dependency_directives_scan::Directive>> + getDirectiveTokens() const { assert(!isError() && "error"); - assert(!MaybeStat->isDirectory() && "not a file"); + assert(!isDirectory() && "not a file"); assert(Contents && "contents not initialized"); - llvm::MemoryBuffer *Buffer = Contents->MinimizedAccess.load(); - assert(Buffer && "not minimized"); - return Buffer->getBuffer(); + if (auto *Directives = Contents->DepDirectives.load()) { + if (Directives->hasValue()) + return ArrayRef<dependency_directives_scan::Directive>( + Directives->getValue()); + } + return None; } /// \returns The error. @@ -224,10 +234,6 @@ /// If the underlying entry is an opened file, this wrapper returns the file /// contents and the scanned preprocessor directives. class EntryRef { - /// For entry that is an opened file, this bit signifies whether its contents - /// are minimized. - bool Minimized; - /// The filename used to access this entry. std::string Filename; @@ -235,8 +241,8 @@ const CachedFileSystemEntry &Entry; public: - EntryRef(bool Minimized, StringRef Name, const CachedFileSystemEntry &Entry) - : Minimized(Minimized), Filename(Name), Entry(Entry) {} + EntryRef(StringRef Name, const CachedFileSystemEntry &Entry) + : Filename(Name), Entry(Entry) {} llvm::vfs::Status getStatus() const { llvm::vfs::Status Stat = Entry.getStatus(); @@ -255,8 +261,11 @@ return *this; } - StringRef getContents() const { - return Minimized ? 
Entry.getDirectiveTokens() : Entry.getOriginalContents(); + StringRef getContents() const { return Entry.getOriginalContents(); } + + Optional<ArrayRef<dependency_directives_scan::Directive>> + getDirectiveTokens() const { + return Entry.getDirectiveTokens(); } }; Index: clang/include/clang/Lex/Lexer.h =================================================================== --- clang/include/clang/Lex/Lexer.h +++ clang/include/clang/Lex/Lexer.h @@ -288,14 +288,8 @@ return BufferPtr - BufferStart; } - /// Skip over \p NumBytes bytes. - /// - /// If the skip is successful, the next token will be lexed from the new - /// offset. The lexer also assumes that we skipped to the start of the line. - /// - /// \returns true if the skip failed (new offset would have been past the - /// end of the buffer), false otherwise. - bool skipOver(unsigned NumBytes); + /// Set the lexer's buffer pointer to \p Offset. + void seek(unsigned Offset, bool IsAtStartOfLine); /// Stringify - Convert the specified string into a C string by i) escaping /// '\\' and " characters and ii) replacing newline character(s) with "\\n". Index: clang/include/clang/Lex/DependencyDirectivesScanner.h =================================================================== --- clang/include/clang/Lex/DependencyDirectivesScanner.h +++ clang/include/clang/Lex/DependencyDirectivesScanner.h @@ -19,15 +19,41 @@ #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" namespace clang { +namespace tok { +enum TokenKind : unsigned short; +} + class DiagnosticsEngine; namespace dependency_directives_scan { +/// Token lexed as part of dependency directive scanning. +struct Token { + /// Offset into the original source input. 
+ unsigned Offset; + unsigned Length; + tok::TokenKind Kind; + unsigned short Flags; + + Token(unsigned Offset, unsigned Length, tok::TokenKind Kind, + unsigned short Flags) + : Offset(Offset), Length(Length), Kind(Kind), Flags(Flags) {} + + unsigned getEnd() const { return Offset + Length; } + + bool is(tok::TokenKind K) const { return Kind == K; } + bool isNot(tok::TokenKind K) const { return Kind != K; } + bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const { + return is(K1) || is(K2); + } + template <typename... Ts> bool isOneOf(tok::TokenKind K1, Ts... Ks) const { + return is(K1) || isOneOf(Ks...); + } +}; + /// Represents the kind of preprocessor directive or a module declaration that /// is tracked by the scanner in its token output. enum DirectiveKind : uint8_t { @@ -52,9 +78,10 @@ pp_else, pp_endif, decl_at_import, - cxx_export_decl, cxx_module_decl, cxx_import_decl, + cxx_export_module_decl, + cxx_export_import_decl, pp_eof, }; @@ -62,35 +89,48 @@ /// scanning. It's used to track various preprocessor directives that could /// potentially have an effect on the depedencies. struct Directive { + ArrayRef<Token> Tokens; + /// The kind of token. DirectiveKind Kind = pp_none; - /// Offset into the output byte stream of where the directive begins. - int Offset = -1; - - Directive(DirectiveKind K, int Offset) : Kind(K), Offset(Offset) {} + Directive() = default; + Directive(DirectiveKind K, ArrayRef<Token> Tokens) + : Tokens(Tokens), Kind(K) {} }; } // end namespace dependency_directives_scan -/// Minimize the input down to the preprocessor directives that might have +/// Scan the input for the preprocessor directives that might have /// an effect on the dependencies for a compilation unit. /// -/// This function deletes all non-preprocessor code, and strips anything that -/// can't affect what gets included. It canonicalizes whitespace where -/// convenient to stabilize the output against formatting changes in the input. 
-/// -/// Clears the output vectors at the beginning of the call. +/// This function ignores all non-preprocessor code and anything that +/// can't affect what gets included. /// /// \returns false on success, true on error. If the diagnostic engine is not /// null, an appropriate error is reported using the given input location -/// with the offset that corresponds to the minimizer's current buffer offset. +/// with the offset that corresponds to the \p Input buffer offset. bool scanSourceForDependencyDirectives( - llvm::StringRef Input, llvm::SmallVectorImpl<char> &Output, - llvm::SmallVectorImpl<dependency_directives_scan::Directive> &Directives, + StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens, + SmallVectorImpl<dependency_directives_scan::Directive> &Directives, DiagnosticsEngine *Diags = nullptr, SourceLocation InputSourceLoc = SourceLocation()); +/// Print the previously scanned dependency directives as minimized source text. +/// +/// \param Source The original source text that the dependency directives were +/// scanned from. +/// \param Directives The previously scanned dependency +/// directives. +/// \param OS the stream to print the dependency directives on. +/// +/// This is used primarily for testing purposes, during dependency scanning the +/// \p Lexer uses the tokens directly, not their printed version. +void printDependencyDirectivesAsSource( + StringRef Source, + ArrayRef<dependency_directives_scan::Directive> Directives, + llvm::raw_ostream &OS); + } // end namespace clang #endif // LLVM_CLANG_LEX_DEPENDENCYDIRECTIVESSCANNER_H
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits