https://github.com/JDevlieghere created https://github.com/llvm/llvm-project/pull/130878
This PR implements a unicode and ANSI escape code aware function to trim and pad strings. This is a break-out from #121860. From f6287805c5aab3d5b7b998afe9db4fc1acb3760a Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere <jo...@devlieghere.com> Date: Tue, 11 Mar 2025 18:59:50 -0700 Subject: [PATCH] [lldb] Implement ANSI & Unicode aware string stripping & padding This PR implements a unicode and ANSI escape code aware function to trim and pad strings. This is a break-out from #121860. --- lldb/include/lldb/Utility/AnsiTerminal.h | 101 +++++++++++++++++--- lldb/unittests/Utility/AnsiTerminalTest.cpp | 49 ++++++++++ 2 files changed, 136 insertions(+), 14 deletions(-) diff --git a/lldb/include/lldb/Utility/AnsiTerminal.h b/lldb/include/lldb/Utility/AnsiTerminal.h index 1939c49c7b859..b388f9fdf4441 100644 --- a/lldb/include/lldb/Utility/AnsiTerminal.h +++ b/lldb/include/lldb/Utility/AnsiTerminal.h @@ -70,9 +70,12 @@ #define ANSI_1_CTRL(ctrl1) "\033["##ctrl1 ANSI_ESC_END #define ANSI_2_CTRL(ctrl1, ctrl2) "\033["##ctrl1 ";"##ctrl2 ANSI_ESC_END +#define ANSI_ESC_START_LEN 2 + #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Locale.h" #include <string> @@ -172,28 +175,98 @@ inline std::string FormatAnsiTerminalCodes(llvm::StringRef format, return fmt; } +inline std::tuple<llvm::StringRef, llvm::StringRef, llvm::StringRef> +FindNextAnsiSequence(llvm::StringRef str) { + llvm::StringRef left; + llvm::StringRef right = str; + + while (!right.empty()) { + const size_t start = right.find(ANSI_ESC_START); + + // ANSI_ESC_START not found. + if (start == llvm::StringRef::npos) + return {str, {}, {}}; + + // Split the string around the current ANSI_ESC_START. + left = str.substr(0, left.size() + start); + llvm::StringRef escape = right.substr(start); + right = right.substr(start + ANSI_ESC_START_LEN + 1); + + const size_t end = right.find_first_not_of("0123456789;"); + + // ANSI_ESC_END found. 
+ if (end < right.size() && (right[end] == 'm' || right[end] == 'G')) + return {left, escape.take_front(ANSI_ESC_START_LEN + 1 + end + 1), + right.substr(end + 1)}; + + // Maintain the invariant that str == left + right at the start of the loop. + left = str.substr(0, left.size() + ANSI_ESC_START_LEN + 1); + } + + return {str, {}, {}}; +} + inline std::string StripAnsiTerminalCodes(llvm::StringRef str) { std::string stripped; while (!str.empty()) { - llvm::StringRef left, right; - - std::tie(left, right) = str.split(ANSI_ESC_START); + auto [left, escape, right] = FindNextAnsiSequence(str); stripped += left; + str = right; + } + return stripped; +} - // ANSI_ESC_START not found. - if (left == str && right.empty()) - break; +inline std::string TrimAndPad(llvm::StringRef str, size_t visible_length, + char padding = ' ') { + std::string result; + size_t result_visibile_length = 0; + + // Trim the string to the given visible length. + while (!str.empty()) { + auto [left, escape, right] = FindNextAnsiSequence(str); + str = right; - size_t end = right.find_first_not_of("0123456789;"); - if (end < right.size() && (right[end] == 'm' || right[end] == 'G')) { - str = right.substr(end + 1); - } else { - // ANSI_ESC_END not found. - stripped += ANSI_ESC_START; - str = right; + // Compute the length of the string without escape codes. If it fits, append + // it together with the invisible escape code. + size_t column_width = llvm::sys::locale::columnWidth(left); + if (result_visibile_length + column_width <= visible_length) { + result.append(left).append(escape); + result_visibile_length += column_width; + continue; + } + + // The string doesn't fit but doesn't contain unicode. + // Append the substring that fits. 
+ if (column_width == left.size()) { + llvm::StringRef trimmed = + left.take_front(visible_length - result_visibile_length); + result.append(trimmed); + result_visibile_length += visible_length - result_visibile_length; + continue; + } + + // The string doesn't fit but contains unicode. Repeatedly trim the string + // until it fits. + llvm::StringRef trimmed = left; + while (!trimmed.empty()) { + // This relies on columnWidth returning -2 for invalid/partial unicode + // characters, which after conversion to size_t will be larger than the + // visible width. + column_width = llvm::sys::locale::columnWidth(trimmed); + if (result_visibile_length + column_width <= visible_length) { + result.append(trimmed); + result_visibile_length += column_width; + break; + } + trimmed = trimmed.drop_back(); } } - return stripped; + + // Pad the string. + if (result_visibile_length < visible_length) + result.append(visible_length - result_visibile_length, padding); + + return result; } } // namespace ansi diff --git a/lldb/unittests/Utility/AnsiTerminalTest.cpp b/lldb/unittests/Utility/AnsiTerminalTest.cpp index 1ba9565c3f6af..cef73ffaf9136 100644 --- a/lldb/unittests/Utility/AnsiTerminalTest.cpp +++ b/lldb/unittests/Utility/AnsiTerminalTest.cpp @@ -67,3 +67,52 @@ TEST(AnsiTerminal, InvalidEscapeCode) { EXPECT_EQ("abc\x1B[31kabcabc", ansi::StripAnsiTerminalCodes("abc\x1B[31kabc\x1B[0mabc")); } + +TEST(AnsiTerminal, FindNextAnsiSequenceBasic) { + auto [left, escape, right] = ansi::FindNextAnsiSequence("foo\x1B[31mbar"); + EXPECT_EQ("foo", left); + EXPECT_EQ("\x1B[31m", escape); + EXPECT_EQ("bar", right); +} + +TEST(AnsiTerminal, FindNextAnsiSequenceIncompleteStart) { + auto [left, escape, right] = + ansi::FindNextAnsiSequence("foo\x1B[bar\x1B[31mbaz"); + EXPECT_EQ("foo\x1B[bar", left); + EXPECT_EQ("\x1B[31m", escape); + EXPECT_EQ("baz", right); +} + +TEST(AnsiTerminal, FindNextAnsiSequenceEscapeStart) { + auto [left, escape, right] = ansi::FindNextAnsiSequence("\x1B[31mfoo"); + 
EXPECT_EQ("", left); + EXPECT_EQ("\x1B[31m", escape); + EXPECT_EQ("foo", right); +} + +TEST(AnsiTerminal, TrimAndPad) { + // Test basic ASCII. + EXPECT_EQ("     ", ansi::TrimAndPad("", 5)); + EXPECT_EQ("foo  ", ansi::TrimAndPad("foo", 5)); + EXPECT_EQ("fooba", ansi::TrimAndPad("fooba", 5)); + EXPECT_EQ("fooba", ansi::TrimAndPad("foobar", 5)); + + // Simple test that ANSI escape codes don't contribute to the visible width. + EXPECT_EQ("\x1B[30m     ", ansi::TrimAndPad("\x1B[30m", 5)); + EXPECT_EQ("\x1B[30mfoo  ", ansi::TrimAndPad("\x1B[30mfoo", 5)); + EXPECT_EQ("\x1B[30mfooba", ansi::TrimAndPad("\x1B[30mfooba", 5)); + EXPECT_EQ("\x1B[30mfooba", ansi::TrimAndPad("\x1B[30mfoobar", 5)); + + // Test that we include as many escape codes as we can. + EXPECT_EQ("fooba\x1B[30m", ansi::TrimAndPad("fooba\x1B[30m", 5)); + EXPECT_EQ("fooba\x1B[30m\x1B[34m", + ansi::TrimAndPad("fooba\x1B[30m\x1B[34m", 5)); + EXPECT_EQ("fooba\x1B[30m\x1B[34m", + ansi::TrimAndPad("fooba\x1B[30m\x1B[34mr", 5)); + + // Test Unicode. + EXPECT_EQ("❤️    ", ansi::TrimAndPad("❤️", 5)); + EXPECT_EQ("    ❤️", ansi::TrimAndPad("    ❤️", 5)); + EXPECT_EQ("12❤️4❤️", ansi::TrimAndPad("12❤️4❤️", 5)); + EXPECT_EQ("12❤️45", ansi::TrimAndPad("12❤️45❤️", 5)); +} _______________________________________________ lldb-commits mailing list lldb-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits