https://github.com/tru updated https://github.com/llvm/llvm-project/pull/139504
>From 859723743a51bc75d855b4ee924ea5cf86e26183 Mon Sep 17 00:00:00 2001 From: Tobias Hieta <tobias.hi...@ubisoft.com> Date: Mon, 5 May 2025 11:40:17 +0200 Subject: [PATCH 1/4] [clang][scandeps] Improve handling of rawstrings. The current parser just checks one step back for a R before each string to know that it's a rawstring. But the preprocessor is much more advanced here and can have constructs like: R\ "str" And much more. This patch also adds more test coverage for Rawstrings in the dependencydirectivesscanner. This was co-authored by Sylvain Audi <sylvain.a...@ubisoft.com> Fixes #137648 --- clang/lib/Lex/DependencyDirectivesScanner.cpp | 43 ++++++++++++--- clang/test/ClangScanDeps/raw-strings.cpp | 55 +++++++++++++++++++ 2 files changed, 91 insertions(+), 7 deletions(-) create mode 100644 clang/test/ClangScanDeps/raw-strings.cpp diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp index 1b6b16c561141..bc936ac749a78 100644 --- a/clang/lib/Lex/DependencyDirectivesScanner.cpp +++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp @@ -206,6 +206,24 @@ static void skipOverSpaces(const char *&First, const char *const End) { ++First; } +// Move back by one character, skipping escaped newlines (backslash + \n) +static char previousChar(const char *First, const char *&Current) { + assert(Current > First); + --Current; + while (Current > First + 1 && isVerticalWhitespace(*Current)) { + const char PrevChar = *(Current - 1); + if (PrevChar == '\\') { + Current -= 2; // backslash + (\n or \r) + } else if (Current > First + 2 && isVerticalWhitespace(PrevChar) && + PrevChar != *Current && *(Current - 2) == '\\') { + Current -= 3; // backslash + (\n\r or \r\n) + } else { + break; + } + } + return *Current; +} + [[nodiscard]] static bool isRawStringLiteral(const char *First, const char *Current) { assert(First <= Current); @@ -215,25 +233,28 @@ static void skipOverSpaces(const char *&First, const char *const End) { return false; // Check for an "R". - --Current; - if (*Current != 'R') + if (previousChar(First, Current) != 'R') return false; - if (First == Current || !isAsciiIdentifierContinue(*--Current)) + if (First == Current || + !isAsciiIdentifierContinue(previousChar(First, Current))) return true; // Check for a prefix of "u", "U", or "L". if (*Current == 'u' || *Current == 'U' || *Current == 'L') - return First == Current || !isAsciiIdentifierContinue(*--Current); + return First == Current || + !isAsciiIdentifierContinue(previousChar(First, Current)); // Check for a prefix of "u8". - if (*Current != '8' || First == Current || *Current-- != 'u') + if (*Current != '8' || First == Current || + previousChar(First, Current) != 'u') return false; - return First == Current || !isAsciiIdentifierContinue(*--Current); + return First == Current || + !isAsciiIdentifierContinue(previousChar(First, Current)); } static void skipRawString(const char *&First, const char *const End) { assert(First[0] == '"'); - assert(First[-1] == 'R'); + //assert(First[-1] == 'R'); const char *Last = ++First; while (Last != End && *Last != '(') @@ -416,6 +437,14 @@ void Scanner::skipLine(const char *&First, const char *const End) { continue; } + // Continue on the same line if an EOL is preceded with backslash + if (First + 1 < End && *First == '\\') { + if (unsigned Len = isEOL(First + 1, End)) { + First += 1 + Len; + continue; + } + } + // Iterate over comments correctly. if (*First != '/' || End - First < 2) { LastTokenPtr = First; diff --git a/clang/test/ClangScanDeps/raw-strings.cpp b/clang/test/ClangScanDeps/raw-strings.cpp new file mode 100644 index 0000000000000..5fda4a559c9e3 --- /dev/null +++ b/clang/test/ClangScanDeps/raw-strings.cpp @@ -0,0 +1,55 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.in > %t/cdb.json + +//--- cdb.json.in +[{ + "directory": "DIR", + "command": "clang -c DIR/tu.c -o DIR/tu.o -IDIR/include", + "file": "DIR/tu.c" +}] +//--- include/header.h +//--- include/header2.h +//--- include/header3.h +//--- include/header4.h +//--- tu.c +#if 0 +R"x()x" +#endif + +#include "header.h" + +#if 0 +R"y("; +#endif +#include "header2.h" + +#if 0 +//")y" +#endif + +#if 0 +R"y("; +R"z()y"; +#endif +#include "header3.h" +#if 0 +//")z" +#endif + +#if 0 +R\ +"y("; +R"z()y"; +#endif +#include "header4.h" +#if 0 +//")z" +#endif + +// RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess | FileCheck %s +// RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess-dependency-directives | FileCheck %s +// CHECK: tu.c +// CHECK-NEXT: header.h +// CHECK-NEXT: header3.h +// CHECK-NEXT: header4.h >From 7baebfba89ab37bfe94ed2743ef6e4af9a932250 Mon Sep 17 00:00:00 2001 From: Tobias Hieta <tobias.hi...@ubisoft.com> Date: Mon, 23 Jun 2025 11:09:08 +0200 Subject: [PATCH 2/4] [clang][lexer] Make Lexer::getEscapedNewLineSize public This is so that we can use it in the dependency scanner without duplicating the logic there. --- clang/include/clang/Lex/Lexer.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h index ca812ba1583fb..0ab192c437040 100644 --- a/clang/include/clang/Lex/Lexer.h +++ b/clang/include/clang/Lex/Lexer.h @@ -585,6 +585,11 @@ class Lexer : public PreprocessorLexer { /// sequence. static bool isNewLineEscaped(const char *BufferStart, const char *Str); + /// getEscapedNewLineSize - Return the size of the specified escaped newline, + /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry + /// to this function. + static unsigned getEscapedNewLineSize(const char *P); + /// Diagnose use of a delimited or named escape sequence. static void DiagnoseDelimitedOrNamedEscapeSequence(SourceLocation Loc, bool Named, @@ -725,11 +730,6 @@ class Lexer : public PreprocessorLexer { /// method. SizedChar getCharAndSizeSlow(const char *Ptr, Token *Tok = nullptr); - /// getEscapedNewLineSize - Return the size of the specified escaped newline, - /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry - /// to this function. - static unsigned getEscapedNewLineSize(const char *P); - /// SkipEscapedNewLines - If P points to an escaped newline (or a series of /// them), skip over them and return the first non-escaped-newline found, /// otherwise return P. >From 63ca11677ab786f77f42aa7dd78871d798f9c4cd Mon Sep 17 00:00:00 2001 From: Tobias Hieta <tobias.hi...@ubisoft.com> Date: Mon, 23 Jun 2025 11:10:31 +0200 Subject: [PATCH 3/4] Improvements to raw-string parsing and expanding tests As suggested in the review, use Lexer::getEscapedNewLineSize() instead of implementing our own logic for it. --- clang/lib/Lex/DependencyDirectivesScanner.cpp | 18 ++-- clang/test/ClangScanDeps/raw-strings.cpp | 93 +++++++++++++++++++ 2 files changed, 104 insertions(+), 7 deletions(-) diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp index bc936ac749a78..eb10cfe4cbf45 100644 --- a/clang/lib/Lex/DependencyDirectivesScanner.cpp +++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp @@ -210,13 +210,17 @@ static void skipOverSpaces(const char *&First, const char *const End) { static char previousChar(const char *First, const char *&Current) { assert(Current > First); --Current; - while (Current > First + 1 && isVerticalWhitespace(*Current)) { - const char PrevChar = *(Current - 1); - if (PrevChar == '\\') { - Current -= 2; // backslash + (\n or \r) - } else if (Current > First + 2 && isVerticalWhitespace(PrevChar) && - PrevChar != *Current && *(Current - 2) == '\\') { - Current -= 3; // backslash + (\n\r or \r\n) + while (Current > First && isVerticalWhitespace(*Current)) { + // Check if the previous character is a backslash + if (Current > First && *(Current - 1) == '\\') { + // Use Lexer's getEscapedNewLineSize to get the size of the escaped newline + unsigned EscapeSize = Lexer::getEscapedNewLineSize(Current); + if (EscapeSize > 0) { + // Skip back over the entire escaped newline sequence (backslash + newline) + Current -= (1 + EscapeSize); + } else { + break; + } } else { break; } diff --git a/clang/test/ClangScanDeps/raw-strings.cpp b/clang/test/ClangScanDeps/raw-strings.cpp index 5fda4a559c9e3..ff8a986643993 100644 --- a/clang/test/ClangScanDeps/raw-strings.cpp +++ b/clang/test/ClangScanDeps/raw-strings.cpp @@ -12,6 +12,16 @@ //--- include/header2.h //--- include/header3.h //--- include/header4.h +//--- include/header5.h +//--- include/header6.h +//--- include/header7.h +//--- include/header8.h +//--- include/header9.h +//--- include/header10.h +//--- include/header11.h +//--- include/header12.h +//--- include/header13.h +//--- include/header14.h //--- tu.c #if 0 R"x()x" @@ -47,9 +57,92 @@ R"z()y"; //")z" #endif +// Test u8 prefix with escaped newline +#if 0 +u8R\ +"prefix(test)prefix" +#endif +#include "header5.h" + +// Test u prefix with multiple escaped newlines +#if 0 +uR\ +\ +"multi(test)multi" +#endif +#include "header6.h" + +// Test U prefix with escaped newline +#if 0 +UR\ +"upper(test)upper" +#endif +#include "header7.h" + +// Test L prefix with escaped newline +#if 0 +LR\ +"wide(test)wide" +#endif +#include "header8.h" + +// Test escaped newline with \r\n style +#if 0 +R\ +"crlf(test)crlf" +#endif +#include "header9.h" + +// Test multiple escaped newlines in different positions +#if 0 +u\ +8\ +R\ +"complex(test)complex" +#endif +#include "header10.h" + +// Test raw string that should NOT be treated as raw (no R prefix due to identifier continuation) +#if 0 +identifierR"notraw(test)notraw" +#endif +#include "header11.h" + +// Test raw string with whitespace before escaped newline +#if 0 +R \ +"whitespace(test)whitespace" +#endif +#include "header12.h" + +// Test nested raw strings in disabled code +#if 0 +R"outer( + R"inner(content)inner" +)outer" +#endif +#include "header13.h" + +// Test raw string with empty delimiter +#if 0 +R\ +"(empty delimiter)"; +#endif +#include "header14.h" + // RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess | FileCheck %s // RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess-dependency-directives | FileCheck %s // CHECK: tu.c // CHECK-NEXT: header.h // CHECK-NEXT: header3.h // CHECK-NEXT: header4.h +// CHECK-NEXT: header5.h +// CHECK-NEXT: header6.h +// CHECK-NEXT: header7.h +// CHECK-NEXT: header8.h +// CHECK-NEXT: header9.h +// CHECK-NEXT: header10.h +// CHECK-NEXT: header11.h +// CHECK-NEXT: header12.h +// CHECK-NEXT: header13.h +// CHECK-NEXT: header14.h >From c3561b0aa34f1b693fa86f2b69b09ea97afedb06 Mon Sep 17 00:00:00 2001 From: Tobias Hieta <tobias.hi...@ubisoft.com> Date: Mon, 23 Jun 2025 11:13:07 +0200 Subject: [PATCH 4/4] Remove assertion that's no longer valid here --- clang/lib/Lex/DependencyDirectivesScanner.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp index eb10cfe4cbf45..56dbb3fbcad6e 100644 --- a/clang/lib/Lex/DependencyDirectivesScanner.cpp +++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp @@ -258,7 +258,6 @@ static char previousChar(const char *First, const char *&Current) { static void skipRawString(const char *&First, const char *const End) { assert(First[0] == '"'); - //assert(First[-1] == 'R'); const char *Last = ++First; while (Last != End && *Last != '(') _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits