https://github.com/shivrm created 
https://github.com/llvm/llvm-project/pull/167150

String literals with user-defined suffixes can now be split between lines.
 - Uses a regex to identify a user-defined suffix at the end of the literal.
 - We want the suffix to be placed only on the last line, so I added 
`ContinuationPrefix` and `ContinuationPostfix` members to 
`BreakableStringLiteral`, so that the last line can have a different postfix 
from all the other lines.
 - `ContinuationPrefix` currently always equals `Prefix` - prefixes are still 
repeated on every line when splitting. I've kept it for completeness.
 - Adds a new unit test for splitting strings with user-defined suffixes.

Fixes #165617

>From d211fd1030494d0db230ccd608f935edc5af8406 Mon Sep 17 00:00:00 2001
From: shivrm <[email protected]>
Date: Fri, 7 Nov 2025 17:02:47 +0530
Subject: [PATCH 1/4] Add splitting for user-defined suffixes

---
 clang/lib/Format/BreakableToken.cpp       | 21 ++++++++---
 clang/lib/Format/BreakableToken.h         | 12 +++++--
 clang/lib/Format/ContinuationIndenter.cpp | 44 +++++++++++++++++------
 3 files changed, 61 insertions(+), 16 deletions(-)

diff --git a/clang/lib/Format/BreakableToken.cpp 
b/clang/lib/Format/BreakableToken.cpp
index 994a427517ffc..ff9f2f10ffac0 100644
--- a/clang/lib/Format/BreakableToken.cpp
+++ b/clang/lib/Format/BreakableToken.cpp
@@ -253,10 +253,13 @@ unsigned 
BreakableStringLiteral::getContentStartColumn(unsigned LineIndex,
 
 BreakableStringLiteral::BreakableStringLiteral(
     const FormatToken &Tok, unsigned StartColumn, StringRef Prefix,
-    StringRef Postfix, unsigned UnbreakableTailLength, bool InPPDirective,
-    encoding::Encoding Encoding, const FormatStyle &Style)
+    StringRef Postfix, StringRef ContinuationPrefix,
+    StringRef ContinuationPostfix, unsigned UnbreakableTailLength,
+    bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
     : BreakableToken(Tok, InPPDirective, Encoding, Style),
       StartColumn(StartColumn), Prefix(Prefix), Postfix(Postfix),
+      ContinuationPrefix(ContinuationPrefix),
+      ContinuationPostfix(ContinuationPostfix),
       UnbreakableTailLength(UnbreakableTailLength) {
   assert(Tok.TokenText.starts_with(Prefix) && 
Tok.TokenText.ends_with(Postfix));
   Line = Tok.TokenText.substr(
@@ -274,9 +277,15 @@ void BreakableStringLiteral::insertBreak(unsigned 
LineIndex,
                                          unsigned TailOffset, Split Split,
                                          unsigned ContentIndent,
                                          WhitespaceManager &Whitespaces) const 
{
+
+  const unsigned SplitEnd = TailOffset + Split.first + Split.second;
+  const bool IsLastFragment = SplitEnd >= Line.size() - UnbreakableTailLength;
+
+  StringRef LocalPostfix = (IsLastFragment) ? Postfix : ContinuationPostfix;
+
   Whitespaces.replaceWhitespaceInToken(
-      Tok, Prefix.size() + TailOffset + Split.first, Split.second, Postfix,
-      Prefix, InPPDirective, 1, StartColumn);
+      Tok, ContinuationPrefix.size() + TailOffset + Split.first, Split.second,
+      LocalPostfix, ContinuationPrefix, InPPDirective, 1, StartColumn);
 }
 
 BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators(
@@ -288,6 +297,10 @@ 
BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators(
                             : QuoteStyle == AtDoubleQuotes        ? "@\""
                                                                   : "\"",
           /*Postfix=*/QuoteStyle == SingleQuotes ? "'" : "\"",
+          /*ContinuationPrefix=*/QuoteStyle == SingleQuotes ? "'"
+                               : QuoteStyle == AtDoubleQuotes                  
  ? "@\""
+                                                            : "\"",
+          /*ContinuationPostfix=*/QuoteStyle == SingleQuotes ? "'" : "\"",
           UnbreakableTailLength, InPPDirective, Encoding, Style),
       BracesNeeded(Tok.isNot(TT_StringInConcatenation)),
       QuoteStyle(QuoteStyle) {
diff --git a/clang/lib/Format/BreakableToken.h 
b/clang/lib/Format/BreakableToken.h
index 45c00b35fd01e..2ee37d3e0e059 100644
--- a/clang/lib/Format/BreakableToken.h
+++ b/clang/lib/Format/BreakableToken.h
@@ -252,6 +252,8 @@ class BreakableStringLiteral : public BreakableToken {
   /// after formatting.
   BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
                          StringRef Prefix, StringRef Postfix,
+                         StringRef ContinuationPrefix,
+                         StringRef ContinuationPostfix,
                          unsigned UnbreakableTailLength, bool InPPDirective,
                          encoding::Encoding Encoding, const FormatStyle 
&Style);
 
@@ -274,15 +276,21 @@ class BreakableStringLiteral : public BreakableToken {
 protected:
   // The column in which the token starts.
   unsigned StartColumn;
-  // The prefix a line needs after a break in the token.
+  // The prefix a line needs at the start
   StringRef Prefix;
-  // The postfix a line needs before introducing a break.
+  // The postfix a line needs at the end
   StringRef Postfix;
+  // The prefix every line except the first line needs
+  StringRef ContinuationPrefix;
+  // The postfix every line except the last line needs
+  StringRef ContinuationPostfix;
   // The token text excluding the prefix and postfix.
   StringRef Line;
   // Length of the sequence of tokens after this string literal that cannot
   // contain line breaks.
   unsigned UnbreakableTailLength;
+  // Whether the string prefix and postfix should be repeated on each line
+  // when breaking the string.
 };
 
 class BreakableStringLiteralUsingOperators : public BreakableStringLiteral {
diff --git a/clang/lib/Format/ContinuationIndenter.cpp 
b/clang/lib/Format/ContinuationIndenter.cpp
index 9ab024a03fbd7..6cfb7a505200e 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -2540,22 +2540,46 @@ ContinuationIndenter::createBreakableToken(const 
FormatToken &Current,
 
     StringRef Prefix;
     StringRef Postfix;
+
     // FIXME: Handle whitespace between '_T', '(', '"..."', and ')'.
     // FIXME: Store Prefix and Suffix (or PrefixLength and SuffixLength to
     // reduce the overhead) for each FormatToken, which is a string, so that we
     // don't run multiple checks here on the hot path.
-    if ((Text.ends_with(Postfix = "\"") &&
-         (Text.starts_with(Prefix = "@\"") || Text.starts_with(Prefix = "\"") 
||
-          Text.starts_with(Prefix = "u\"") ||
-          Text.starts_with(Prefix = "U\"") ||
-          Text.starts_with(Prefix = "u8\"") ||
-          Text.starts_with(Prefix = "L\""))) ||
-        (Text.starts_with(Prefix = "_T(\"") &&
-         Text.ends_with(Postfix = "\")"))) {
+    if (Text.starts_with(Prefix = "_T(\"") && Text.ends_with(Postfix = "\")")) 
{
+      // We need to put `_T("` and `")` on each line because it is a macro
+      llvm::StringRef ContinuationPrefix = Prefix;
+      llvm::StringRef ContinuationPostfix = Postfix;
+
       return std::make_unique<BreakableStringLiteral>(
-          Current, StartColumn, Prefix, Postfix, UnbreakableTailLength,
-          State.Line->InPPDirective, Encoding, Style);
+          Current, StartColumn, Prefix, Postfix, ContinuationPrefix,
+          ContinuationPostfix, UnbreakableTailLength, 
State.Line->InPPDirective,
+          Encoding, Style);
+    }
+
+    static const auto PostfixRegex =
+        llvm::Regex(R"("(_[a-zA-Z_][a-zA-Z0-9_]*)?$)");
+    llvm::SmallVector<llvm::StringRef, 1> Matches;
+
+    if (PostfixRegex.match(Text, &Matches)) {
+      Postfix = Matches.front();
+
+      if ((Text.starts_with(Prefix = "@\"") ||
+           Text.starts_with(Prefix = "\"") ||
+           Text.starts_with(Prefix = "u\"") ||
+           Text.starts_with(Prefix = "U\"") ||
+           Text.starts_with(Prefix = "u8\"") ||
+           Text.starts_with(Prefix = "L\""))) {
+
+        // Use quotes when breaking the string
+        llvm::StringRef ContinuationPrefix = "\"";
+        llvm::StringRef ContinuationPostfix = "\"";
+        return std::make_unique<BreakableStringLiteral>(
+            Current, StartColumn, Prefix, Postfix, ContinuationPrefix,
+            ContinuationPostfix, UnbreakableTailLength,
+            State.Line->InPPDirective, Encoding, Style);
+      }
     }
+
   } else if (Current.is(TT_BlockComment)) {
     if (Style.ReflowComments == FormatStyle::RCS_Never ||
         // If a comment token switches formatting, like

>From 93060fdd0a3b03ed6a9c38a06a5e6819f67c13e4 Mon Sep 17 00:00:00 2001
From: shivrm <[email protected]>
Date: Fri, 7 Nov 2025 22:06:12 +0530
Subject: [PATCH 2/4] Modify string splitting to repeat prefix

---
 clang/lib/Format/BreakableToken.cpp       | 2 +-
 clang/lib/Format/ContinuationIndenter.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Format/BreakableToken.cpp 
b/clang/lib/Format/BreakableToken.cpp
index ff9f2f10ffac0..872660535eb35 100644
--- a/clang/lib/Format/BreakableToken.cpp
+++ b/clang/lib/Format/BreakableToken.cpp
@@ -298,7 +298,7 @@ 
BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators(
                                                                   : "\"",
           /*Postfix=*/QuoteStyle == SingleQuotes ? "'" : "\"",
           /*ContinuationPrefix=*/QuoteStyle == SingleQuotes ? "'"
-                               : QuoteStyle == AtDoubleQuotes                  
  ? "@\""
+          : QuoteStyle == AtDoubleQuotes                    ? "@\""
                                                             : "\"",
           /*ContinuationPostfix=*/QuoteStyle == SingleQuotes ? "'" : "\"",
           UnbreakableTailLength, InPPDirective, Encoding, Style),
diff --git a/clang/lib/Format/ContinuationIndenter.cpp 
b/clang/lib/Format/ContinuationIndenter.cpp
index 6cfb7a505200e..5badd6edf4a7b 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -2570,8 +2570,8 @@ ContinuationIndenter::createBreakableToken(const 
FormatToken &Current,
            Text.starts_with(Prefix = "u8\"") ||
            Text.starts_with(Prefix = "L\""))) {
 
-        // Use quotes when breaking the string
-        llvm::StringRef ContinuationPrefix = "\"";
+        // Repeat the prefix on every line but don't repeat the suffix
+        llvm::StringRef ContinuationPrefix = Prefix;
         llvm::StringRef ContinuationPostfix = "\"";
         return std::make_unique<BreakableStringLiteral>(
             Current, StartColumn, Prefix, Postfix, ContinuationPrefix,

>From 91c9b81e83f82af2103f258b699fa5202fc2af89 Mon Sep 17 00:00:00 2001
From: shivrm <[email protected]>
Date: Fri, 7 Nov 2025 22:25:39 +0530
Subject: [PATCH 3/4] Fix bug causing repetition of suffixes

---
 clang/lib/Format/BreakableToken.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/lib/Format/BreakableToken.cpp 
b/clang/lib/Format/BreakableToken.cpp
index 872660535eb35..dd9d4ecb2f3c7 100644
--- a/clang/lib/Format/BreakableToken.cpp
+++ b/clang/lib/Format/BreakableToken.cpp
@@ -279,8 +279,7 @@ void BreakableStringLiteral::insertBreak(unsigned LineIndex,
                                          WhitespaceManager &Whitespaces) const 
{
 
   const unsigned SplitEnd = TailOffset + Split.first + Split.second;
-  const bool IsLastFragment = SplitEnd >= Line.size() - UnbreakableTailLength;
-
+  const bool IsLastFragment = SplitEnd > Line.size() - UnbreakableTailLength;
   StringRef LocalPostfix = (IsLastFragment) ? Postfix : ContinuationPostfix;
 
   Whitespaces.replaceWhitespaceInToken(

>From faa1996fa3238d95386d8377da96689714551a25 Mon Sep 17 00:00:00 2001
From: shivrm <[email protected]>
Date: Sat, 8 Nov 2025 21:17:21 +0530
Subject: [PATCH 4/4] Add unit tests

---
 clang/unittests/Format/FormatTest.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/unittests/Format/FormatTest.cpp 
b/clang/unittests/Format/FormatTest.cpp
index 24235b966399d..4c7593b88202f 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -15838,6 +15838,10 @@ TEST_F(FormatTest, BreaksWideAndNSStringLiterals) {
                "@\"NSString literal\";", getGoogleStyleWithColumns(19));
   verifyFormat(R"(NSString *s = @"那那那那";)", getLLVMStyleWithColumns(26));
 
+  EXPECT_EQ("L\"suffixed \"\n"
+            "L\"string\"_s;",
+            format("L\"suffixed string\"_s;", getLLVMStyleWithColumns(19)));
+
   // This input makes clang-format try to split the incomplete unicode escape
   // sequence, which used to lead to a crasher.
   verifyNoCrash(

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to