i18nutil/CppunitTest_i18nutil_unicodeescape.mk | 25 +++++ i18nutil/Library_i18nutil.mk | 1 i18nutil/Module_i18nutil.mk | 1 i18nutil/qa/cppunit/test_unicodeescape.cxx | 121 +++++++++++++++++++++++++ i18nutil/source/utility/unicodeescape.cxx | 102 +++++++++++++++++++++ include/i18nutil/unicodeescape.hxx | 31 ++++++ sc/qa/unit/ucalc.cxx | 51 ++++++++++ sc/source/core/data/table6.cxx | 2 sw/qa/extras/uiwriter/uiwriter7.cxx | 52 ++++++++++ sw/source/core/crsr/findtxt.cxx | 10 +- 10 files changed, 395 insertions(+), 1 deletion(-)
New commits: commit fc800ec1791d5b0b614577b345fbea65933a229b Author: AnamayNarkar <[email protected]> AuthorDate: Mon Mar 2 05:15:23 2026 +0530 Commit: Mike Kaganski <[email protected]> CommitDate: Tue Mar 3 07:19:46 2026 +0100 tdf#106137 sc: process Unicode escapes in regex replacement strings This uses i18nutil::processUnicodeEscapes(), introduced in the parent commit. Change-Id: Ieb780d494ea3cdb8a976bb4ca33d95baa7127c8e Reviewed-on: https://gerrit.libreoffice.org/c/core/+/200739 Tested-by: Jenkins Reviewed-by: Mike Kaganski <[email protected]> diff --git a/sc/qa/unit/ucalc.cxx b/sc/qa/unit/ucalc.cxx index 34b6f4e2bda0..79c579016ac6 100644 --- a/sc/qa/unit/ucalc.cxx +++ b/sc/qa/unit/ucalc.cxx @@ -4460,6 +4460,57 @@ CPPUNIT_TEST_FIXTURE(Test, testSearchCells) m_pDoc->DeleteTab(0); } +CPPUNIT_TEST_FIXTURE(Test, testTdf106137_UnicodeEscapeInReplacement) +{ + m_pDoc->InsertTab(0, u"Test"_ustr); + + // test basic unicode escape expansion + // unicode values in replacement strings should expand to Unicode character when regular expressions is selected + m_pDoc->SetString(ScAddress(0, 0, 0), u"hello world"_ustr); + + SvxSearchItem aItem(SID_SEARCH_ITEM); + aItem.SetSearchString(u"world"_ustr); + aItem.SetReplaceString(u"\\u0041\\u0042\\u0043"_ustr); + aItem.SetCommand(SvxSearchCmd::REPLACE_ALL); + aItem.SetRegExp(true); + + ScMarkData aMarkData(m_pDoc->GetSheetLimits()); + aMarkData.SelectOneTable(0); + SCCOL nCol = 0; + SCROW nRow = 0; + SCTAB nTab = 0; + ScRangeList aMatchedRanges; + OUString aUndoStr; + bool bClamped = false; + + m_pDoc->SearchAndReplace(aItem, nCol, nRow, nTab, aMarkData, aMatchedRanges, aUndoStr, nullptr, + bClamped); + CPPUNIT_ASSERT_EQUAL(u"hello ABC"_ustr, m_pDoc->GetString(ScAddress(0, 0, 0))); + + // test Backreference edge case + // \uXXXX in replacement template should expand before back-references are substituted, + // so that the 'searched' content containing literal \u sequences is not accidentally expanded + + // Insert text that contains 
an actual escaped unicode string + m_pDoc->SetString(ScAddress(0, 1, 0), u"find \\u0042"_ustr); // A2 + + // Capture the literal "\u0042" in a regex group + aItem.SetSearchString(u"(find .*)"_ustr); + // Replace with \u0041 (which should become 'A') + the backreference + aItem.SetReplaceString(u"\\u0041 $1"_ustr); + + nCol = 0; + nRow = 0; + nTab = 0; + m_pDoc->SearchAndReplace(aItem, nCol, nRow, nTab, aMarkData, aMatchedRanges, aUndoStr, nullptr, + bClamped); + + // shouldn't be 'A find B' + CPPUNIT_ASSERT_EQUAL(u"A find \\u0042"_ustr, m_pDoc->GetString(ScAddress(0, 1, 0))); + + m_pDoc->DeleteTab(0); +} + CPPUNIT_TEST_FIXTURE(Test, testFormulaPosition) { m_pDoc->InsertTab(0, u"Test"_ustr); diff --git a/sc/source/core/data/table6.cxx b/sc/source/core/data/table6.cxx index 970ae62e6467..89803bb125cb 100644 --- a/sc/source/core/data/table6.cxx +++ b/sc/source/core/data/table6.cxx @@ -32,6 +32,7 @@ #include <markdata.hxx> #include <editutil.hxx> #include <postit.hxx> +#include <i18nutil/unicodeescape.hxx> namespace { @@ -185,6 +186,7 @@ bool ScTable::SearchCell(const SvxSearchItem& rSearchItem, SCCOL nCol, sc::Colum OUString sReplStr = rSearchItem.GetReplaceString(); if (rSearchItem.GetRegExp()) { + sReplStr = i18nutil::processUnicodeEscapes(sReplStr); utl::TextSearch::ReplaceBackReferences( sReplStr, aString, aSearchResult ); OUStringBuffer aStrBuffer(aString); aStrBuffer.remove(nStart, nEnd-nStart+1); commit 5c7833fee241d2ec840b0a9998d6900cdd844e58 Author: AnamayNarkar <[email protected]> AuthorDate: Sat Feb 21 15:44:14 2026 +0530 Commit: Mike Kaganski <[email protected]> CommitDate: Tue Mar 3 07:19:39 2026 +0100 tdf#106137 sw: process Unicode escapes in regex replacement strings When using Find & Replace with regular expressions enabled, \uhhhh and \Uhhhhhhhh sequences in the replacement string are now expanded to their corresponding Unicode characters. This uses i18nutil::processUnicodeEscapes(), introduced in the parent commit. 
Change-Id: Ifc2a634e9b5eb2589a46b309441ea7691d2321b7 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/199924 Tested-by: Jenkins Reviewed-by: Mike Kaganski <[email protected]> diff --git a/sw/qa/extras/uiwriter/uiwriter7.cxx b/sw/qa/extras/uiwriter/uiwriter7.cxx index c22b02d415e0..cb66eff3471f 100644 --- a/sw/qa/extras/uiwriter/uiwriter7.cxx +++ b/sw/qa/extras/uiwriter/uiwriter7.cxx @@ -2961,6 +2961,58 @@ CPPUNIT_TEST_FIXTURE(SwUiWriterTest7, testTdf149089) CPPUNIT_ASSERT_EQUAL(nGridWidth1, nGridWidth2); } +CPPUNIT_TEST_FIXTURE(SwUiWriterTest7, testTdf106137_UnicodeEscapeInReplacement) +{ + // unicode values in replacement strings should expand to Unicode character when regular expressions is selected as an option + createSwDoc(); + SwDoc* pDoc = getSwDoc(); + SwCursorShell* pShell(pDoc->GetEditShell()); + CPPUNIT_ASSERT(pShell); + SwPaM* pCursor = pShell->GetCursor(); + IDocumentContentOperations& rIDCO(pDoc->getIDocumentContentOperations()); + + rIDCO.InsertString(*pCursor, u"hello world"_ustr); + + uno::Reference<util::XReplaceable> xReplace(mxComponent, uno::UNO_QUERY); + uno::Reference<util::XReplaceDescriptor> xReplaceDes = xReplace->createReplaceDescriptor(); + xReplaceDes->setPropertyValue(u"SearchRegularExpression"_ustr, uno::Any(true)); + + xReplaceDes->setSearchString(u"world"_ustr); + xReplaceDes->setReplaceString(u"\\u0041\\u0042\\u0043"_ustr); + sal_Int32 nCount = xReplace->replaceAll(xReplaceDes); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), nCount); + CPPUNIT_ASSERT_EQUAL(u"hello ABC"_ustr, pCursor->GetPointNode().GetTextNode()->GetText()); +} + +CPPUNIT_TEST_FIXTURE(SwUiWriterTest7, testTdf106137_UnicodeEscapeInReplacement_BackRef) +{ + // \uXXXX in replacement template should expand before back-references are substituted, + // so that the 'searched' content containing literal \u sequences is not accidentally expanded + createSwDoc(); + SwDoc* pDoc = getSwDoc(); + SwCursorShell* pShell(pDoc->GetEditShell()); + CPPUNIT_ASSERT(pShell); + SwPaM* pCursor = 
pShell->GetCursor(); + IDocumentContentOperations& rIDCO(pDoc->getIDocumentContentOperations()); + + // Insert text that contains an actual escaped unicode string + rIDCO.InsertString(*pCursor, u"find \\u0042"_ustr); + + uno::Reference<util::XReplaceable> xReplace(mxComponent, uno::UNO_QUERY); + uno::Reference<util::XReplaceDescriptor> xReplaceDes = xReplace->createReplaceDescriptor(); + xReplaceDes->setPropertyValue(u"SearchRegularExpression"_ustr, uno::Any(true)); + + // Capture the literal "\u0042" in a regex group + xReplaceDes->setSearchString(u"(find .*)"_ustr); + // Replace with \u0041 (which should become 'A') + the backreference + xReplaceDes->setReplaceString(u"\\u0041 $1"_ustr); + sal_Int32 nCount = xReplace->replaceAll(xReplaceDes); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), nCount); + // shouldn't be 'A find B' + CPPUNIT_ASSERT_EQUAL(u"A find \\u0042"_ustr, pCursor->GetPointNode().GetTextNode()->GetText()); +} + CPPUNIT_PLUGIN_IMPLEMENT(); /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sw/source/core/crsr/findtxt.cxx b/sw/source/core/crsr/findtxt.cxx index 07a19bde59b7..264b1261cafa 100644 --- a/sw/source/core/crsr/findtxt.cxx +++ b/sw/source/core/crsr/findtxt.cxx @@ -53,6 +53,7 @@ #include <docsh.hxx> #include <PostItMgr.hxx> #include <view.hxx> +#include <i18nutil/unicodeescape.hxx> using namespace ::com::sun::star; using namespace util; @@ -984,8 +985,14 @@ int SwFindParaText::DoFind(SwPaM & rCursor, SwMoveFnCollection const & fnMove, std::optional<OUString> xRepl; if (bRegExp) xRepl = sw::ReplaceBackReferences(m_rSearchOpt, &rCursor, m_pLayout); + + // process \uhhhh and \Uhhhhhhhh escapes for regex replacements + OUString aFinalReplStr = xRepl ? *xRepl : m_rSearchOpt.replaceString; + if (bRegExp && !xRepl) // fallback for when ReplaceBackReferences returns null + aFinalReplStr = i18nutil::processUnicodeEscapes(aFinalReplStr); + bool const bReplaced = sw::ReplaceImpl(rCursor, - xRepl ? 
*xRepl : m_rSearchOpt.replaceString, + aFinalReplStr, bRegExp, m_rCursor.GetDoc(), m_pLayout); m_rCursor.SaveTableBoxContent( rCursor.GetPoint() ); @@ -1149,6 +1156,7 @@ std::optional<OUString> ReplaceBackReferences(const i18nutil::SearchOptions2& rS utl::TextSearch aSText(rSearchOpt); SearchResult aResult; OUString aReplaceStr( rSearchOpt.replaceString ); + aReplaceStr = i18nutil::processUnicodeEscapes(aReplaceStr); if (bParaEnd) { static constexpr OUString aStr(u"\n"_ustr); commit 5936fab3e6293101a536f55904ff2072f2bb133e Author: AnamayNarkar <[email protected]> AuthorDate: Fri Feb 6 02:24:09 2026 +0530 Commit: Mike Kaganski <[email protected]> CommitDate: Tue Mar 3 07:19:32 2026 +0100 tdf#106137 i18nutil: Add helper to process Unicode escapes Adds a new utility function processUnicodeEscapes() to i18nutil. This supports parsing: * Standard escapes: \uhhhh (e.g. \u0041) * Extended escapes: \Uhhhhhhhh (e.g. \U0001F600) * Escaped backslashes: \\ Change-Id: I589eb2403376c8cef694f52f1c785d42fba94ff9 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/198777 Tested-by: Jenkins Reviewed-by: Mike Kaganski <[email protected]> diff --git a/i18nutil/CppunitTest_i18nutil_unicodeescape.mk b/i18nutil/CppunitTest_i18nutil_unicodeescape.mk new file mode 100644 index 000000000000..7efe55097331 --- /dev/null +++ b/i18nutil/CppunitTest_i18nutil_unicodeescape.mk @@ -0,0 +1,25 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+# + +$(eval $(call gb_CppunitTest_CppunitTest,i18nutil_unicodeescape)) + +$(eval $(call gb_CppunitTest_use_external,i18nutil_unicodeescape,boost_headers)) + +$(eval $(call gb_CppunitTest_add_exception_objects,i18nutil_unicodeescape, \ + i18nutil/qa/cppunit/test_unicodeescape \ +)) + +$(eval $(call gb_CppunitTest_use_libraries,i18nutil_unicodeescape, \ + cppu \ + cppuhelper \ + i18nutil \ + sal \ +)) + +# vim: set noet sw=4 ts=4: diff --git a/i18nutil/Library_i18nutil.mk b/i18nutil/Library_i18nutil.mk index cf166a476401..4c6923e8a621 100644 --- a/i18nutil/Library_i18nutil.mk +++ b/i18nutil/Library_i18nutil.mk @@ -52,6 +52,7 @@ $(eval $(call gb_Library_add_exception_objects,i18nutil,\ i18nutil/source/utility/scriptclass \ i18nutil/source/utility/scripttypedetector \ i18nutil/source/utility/unicode \ + i18nutil/source/utility/unicodeescape \ i18nutil/source/utility/widthfolding \ )) diff --git a/i18nutil/Module_i18nutil.mk b/i18nutil/Module_i18nutil.mk index 3fac872e83ed..dc9bc57c62cf 100644 --- a/i18nutil/Module_i18nutil.mk +++ b/i18nutil/Module_i18nutil.mk @@ -14,6 +14,7 @@ $(eval $(call gb_Module_add_targets,i18nutil,\ $(eval $(call gb_Module_add_check_targets,i18nutil,\ CppunitTest_i18nutil \ + CppunitTest_i18nutil_unicodeescape \ )) # vim: set noet sw=4: diff --git a/i18nutil/qa/cppunit/test_unicodeescape.cxx b/i18nutil/qa/cppunit/test_unicodeescape.cxx new file mode 100644 index 000000000000..2cb098e692b8 --- /dev/null +++ b/i18nutil/qa/cppunit/test_unicodeescape.cxx @@ -0,0 +1,121 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include <cppunit/TestFixture.h> +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/plugin/TestPlugIn.h> +#include <i18nutil/unicodeescape.hxx> +#include <o3tl/string_view.hxx> + +using namespace o3tl; + +class TestUnicodeEscape : public CppUnit::TestFixture +{ +public: + void testBasicUnicodeEscape(); + void testExtendedUnicodeEscape(); + void testEscapedBackslash(); + void testInvalidEscape(); + void testControlCharFiltering(); + void testSurrogatePair(); + + CPPUNIT_TEST_SUITE(TestUnicodeEscape); + CPPUNIT_TEST(testBasicUnicodeEscape); + CPPUNIT_TEST(testExtendedUnicodeEscape); + CPPUNIT_TEST(testEscapedBackslash); + CPPUNIT_TEST(testInvalidEscape); + CPPUNIT_TEST(testControlCharFiltering); + CPPUNIT_TEST(testSurrogatePair); + CPPUNIT_TEST_SUITE_END(); +}; + +void TestUnicodeEscape::testBasicUnicodeEscape() +{ + // \u0041 should become 'A' + OUString input(u"\\u0041"_ustr); + OUString result = i18nutil::processUnicodeEscapes(input); + CPPUNIT_ASSERT_EQUAL(u"A"_ustr, result); + + // Multiple escapes + input = u"\\u0041\\u0042\\u0043"_ustr; + result = i18nutil::processUnicodeEscapes(input); + CPPUNIT_ASSERT_EQUAL(u"ABC"_ustr, result); + + // Mixed with text + input = u"Hello \\u0041 World"_ustr; + result = i18nutil::processUnicodeEscapes(input); + CPPUNIT_ASSERT_EQUAL(u"Hello A World"_ustr, result); +} + +void TestUnicodeEscape::testExtendedUnicodeEscape() +{ + // \U00000041 should become 'A' + OUString result = i18nutil::processUnicodeEscapes(u"\\U00000041"); + CPPUNIT_ASSERT_EQUAL(u"A"_ustr, result); +} + +void TestUnicodeEscape::testEscapedBackslash() +{ + // \\u0041 should become literal \u0041 + OUString result = i18nutil::processUnicodeEscapes(u"\\\\u0041"); + CPPUNIT_ASSERT_EQUAL(u"\\u0041"_ustr, result); +} + +void TestUnicodeEscape::testInvalidEscape() +{ + // Invalid hex should keep literal + OUString input(u"\\uXYZW"_ustr); + OUString result = i18nutil::processUnicodeEscapes(input); + CPPUNIT_ASSERT_EQUAL(input, result); + + // Incomplete 
escape should keep literal + input = u"\\u004"_ustr; + result = i18nutil::processUnicodeEscapes(input); + CPPUNIT_ASSERT_EQUAL(input, result); +} + +void TestUnicodeEscape::testControlCharFiltering() +{ + // \u000A (LF) should be allowed + OUString input(u"\\u000A"_ustr); + OUString result = i18nutil::processUnicodeEscapes(input); + CPPUNIT_ASSERT_EQUAL(u"\n"_ustr, result); + + // \u0009 (TAB) should be allowed + input = u"\\u0009"_ustr; + result = i18nutil::processUnicodeEscapes(input); + CPPUNIT_ASSERT_EQUAL(u"\t"_ustr, result); + + // \u0000 (NULL) should be filtered + input = u"\\u0000"_ustr; + result = i18nutil::processUnicodeEscapes(input); + CPPUNIT_ASSERT_EQUAL(input, result); // Should remain literal + + input = u"\\\\uXYZW"_ustr; + result = i18nutil::processUnicodeEscapes(input); + + CPPUNIT_ASSERT_EQUAL(u"\\uXYZW"_ustr, result); +} + +void TestUnicodeEscape::testSurrogatePair() +{ + // \U0001F600 (emoji) should work with surrogate pairs + OUString result = i18nutil::processUnicodeEscapes(u"\\U0001F600"); + + // U+1F600 = UTF-16: 0xD83D 0xDE00 + sal_Unicode surrogates[2] = { 0xD83D, 0xDE00 }; + OUString expected(surrogates, 2); + + CPPUNIT_ASSERT_EQUAL(expected, result); +} + +CPPUNIT_TEST_SUITE_REGISTRATION(TestUnicodeEscape); +CPPUNIT_PLUGIN_IMPLEMENT(); + +/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/i18nutil/source/utility/unicodeescape.cxx b/i18nutil/source/utility/unicodeescape.cxx new file mode 100644 index 000000000000..cf9fee576565 --- /dev/null +++ b/i18nutil/source/utility/unicodeescape.cxx @@ -0,0 +1,102 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */
+
+#include <i18nutil/unicodeescape.hxx>
+#include <rtl/character.hxx>
+#include <rtl/ustrbuf.hxx>
+#include <o3tl/numeric.hxx>
+#include <cassert>
+
+namespace i18nutil
+{
+namespace
+{
+/**
+ * Convert a run of hexadecimal digits into the numeric value it spells.
+ *
+ * @param hexStr Digits to convert; at most 8 characters (asserted), so the
+ *               accumulated value always fits into 32 bits.
+ * @return The accumulated value, or 0xFFFFFFFF as soon as a character is
+ *         not a hex digit (o3tl::convertToHex reports that with a negative
+ *         result).
+ */
+sal_uInt32 hexToCodepoint(std::u16string_view hexStr)
+{
+    assert(hexStr.size() <= 8);
+
+    sal_uInt32 result = 0;
+    for (sal_Unicode c : hexStr)
+    {
+        sal_Int32 hexValue = o3tl::convertToHex<sal_Int32>(c);
+        if (hexValue < 0)
+            return 0xFFFFFFFF; // Invalid
+        result = (result << 4) | hexValue;
+    }
+    return result;
+}
+
+/**
+ * Decide whether an escape may expand to the given code point.
+ *
+ * Below 0x20 only TAB, LF and CR are accepted; every other value is
+ * delegated to rtl::isUnicodeCodePoint.
+ * NOTE(review): the 0xFFFFFFFF "invalid" marker from hexToCodepoint is
+ * presumably rejected by rtl::isUnicodeCodePoint - confirm.
+ */
+bool isAllowedCodepoint(sal_uInt32 cp)
+{
+    if (cp < 0x20)
+    {
+        return cp == 0x09 || cp == 0x0A || cp == 0x0D; // TAB, LF, CR
+    }
+    return rtl::isUnicodeCodePoint(cp);
+}
+
+} // local namespace
+
+/**
+ * Expand backslash escapes in @p input.
+ *
+ * Recognized sequences:
+ *  - (backslash)(backslash) -> one backslash
+ *  - (backslash)uhhhh       -> the UTF-16 code unit with that value
+ *  - (backslash)Uhhhhhhhh   -> the code point, appended via appendUtf32
+ *
+ * A sequence that is too short, contains a non-hex digit, or names a code
+ * point refused by isAllowedCodepoint is not an error: its backslash is
+ * copied through unchanged and scanning resumes at the next character.
+ *
+ * @param input Text that may contain escape sequences.
+ * @return A new string with all valid escapes expanded.
+ */
+OUString processUnicodeEscapes(std::u16string_view input)
+{
+    OUStringBuffer result;
+    sal_Int32 len = input.size();
+
+    for (sal_Int32 i = 0; i < len; ++i)
+    {
+        // Check for backslash and ensure we have at least one char after it
+        if (input[i] == '\\' && i + 1 < len)
+        {
+            sal_Unicode next = input[i + 1];
+
+            // Handle escaped backslash: (backslash)(backslash) -> (backslash)
+            if (next == '\\')
+            {
+                result.append('\\');
+                ++i; // Skip the second backslash
+                continue;
+            }
+
+            // Handle \uhhhh (4 hex digits); i + 5 is the index of the last digit
+            else if (next == 'u' && i + 5 < len)
+            {
+                sal_uInt32 codepoint = hexToCodepoint(input.substr(i + 2, 4));
+                if (isAllowedCodepoint(codepoint))
+                {
+                    // Four hex digits cannot exceed 0xFFFF, so one code unit suffices
+                    result.append(sal_Unicode(codepoint));
+                    i += 5; // Skip \uhhhh
+                    continue;
+                }
+            }
+
+            // Handle \Uhhhhhhhh (8 hex digits); i + 9 is the index of the last digit
+            else if (next == 'U' && i + 9 < len)
+            {
+                sal_uInt32 codepoint = hexToCodepoint(input.substr(i + 2, 8));
+                if (isAllowedCodepoint(codepoint))
+                {
+                    result.appendUtf32(codepoint);
+                    i += 9; // Skip \Uhhhhhhhh
+                    continue;
+                }
+            }
+        }
+
+        // If not a valid escape, or just a regular character, append it as is
+        result.append(input[i]);
+    }
+
+    return result.makeStringAndClear();
+}
+
+} // namespace i18nutil
+
+/*
vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/include/i18nutil/unicodeescape.hxx b/include/i18nutil/unicodeescape.hxx new file mode 100644 index 000000000000..064d6d82e380 --- /dev/null +++ b/include/i18nutil/unicodeescape.hxx @@ -0,0 +1,31 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include <rtl/ustring.hxx> +#include <i18nutil/i18nutildllapi.h> + +namespace i18nutil +{ +/** + * Process Unicode escape sequences in a string. + * Converts \uhhhh (4 hex digits) and \Uhhhhhhhh (8 hex digits) to their + * corresponding Unicode characters. + * Handles escaped backslashes: \\ becomes \ + * Filters out control characters < 0x20 except TAB, LF, and CR. + * + * @param input String potentially containing Unicode escape sequences + * @return String with escape sequences converted to actual Unicode characters + */ +I18NUTIL_DLLPUBLIC OUString processUnicodeEscapes(std::u16string_view input); + +} // namespace i18nutil + +/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */
