i18npool/qa/cppunit/test_breakiterator.cxx | 30 ++++++++++++++++++++ i18npool/source/breakiterator/data/dict_word_hu.txt | 2 - i18npool/source/breakiterator/data/edit_word.txt | 8 +++-- i18npool/source/breakiterator/data/edit_word_hu.txt | 8 +++-- 4 files changed, 41 insertions(+), 7 deletions(-)
New commits: commit 2b9fee5a3e9d1eae65932fb0f08f0216f8a30cf7 Author: László Németh <nem...@numbertext.org> AuthorDate: Thu Jun 27 11:06:35 2024 +0200 Commit: László Németh <nem...@numbertext.org> CommitDate: Thu Jun 27 16:49:51 2024 +0200 tdf#161737 i18npool: fix bad word selection with NNBSP Fix the word-breaking rules for editing, too. Previously, the word was selected together with the narrow no-break space that follows it, e.g. in French words before exclamation and question marks (where the narrow no-break space provides correct typography if the OpenType/Graphite font doesn't have this feature). Add this and the previous fixes for Hungarian as well, which is handled by extra word-breaking rule files. Follow-up to commit 6e002da1615b52cda4e9331e87878458b1fe9677 "tdf#161737 i18npool: fix fake spelling alarms with NNBSP". Change-Id: I7230bd356e5f0360172b652e615a61d96131d336 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/169624 Tested-by: Jenkins Reviewed-by: László Németh <nem...@numbertext.org> diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 6fbde026f565..7e9f47ad22f1 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -1022,6 +1022,36 @@ void TestBreakIterator::testWordBoundaries() // This was 8 (word + NNBSP) CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); } + + // tdf#161737: narrow no-break space at the end of words resulted spelling mistakes + { + aLocale.Language = "hu"; + aLocale.Country = "HU"; + + OUString aTest(u"L’espace fine insécable\u202F!"_ustr); + aBounds + = m_xBreak->getWordBoundary(aTest, 14, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos); + // This was 24 (word + NNBSP) + CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds.endPos); + } + + // tdf#161737: narrow no-break space between digits resulted spelling mistakes + // as a quick fix, limit NBSP as word-part character only for editing, and not 
for spell checking + // TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow + // to check numbers with thousand separators and with correct suffix + { + aLocale.Language = "hu"; + aLocale.Country = "HU"; + + OUString aTest(u"1\u202F000\u202F000"_ustr); + aBounds + = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); + // This was 0 (word + NNBSP) + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + // This was 8 (word + NNBSP) + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + } } void TestBreakIterator::testSentenceBoundaries() diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt b/i18npool/source/breakiterator/data/dict_word_hu.txt index 88648e6e5716..4ba426c8c7db 100644 --- a/i18npool/source/breakiterator/data/dict_word_hu.txt +++ b/i18npool/source/breakiterator/data/dict_word_hu.txt @@ -53,7 +53,7 @@ $Double_Quote = [\p{Word_Break = Double_Quote}]; $MidNumLet = [\p{Word_Break = MidNumLet}]; $MidNum = [\p{Word_Break = MidNum}]; $Numeric = [\p{Word_Break = Numeric}]; -$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name = NARROW NO-BREAK SPACE:]]; $WSegSpace = [\p{Word_Break = WSegSpace}]; $Extended_Pict = [\p{Extended_Pictographic}]; diff --git a/i18npool/source/breakiterator/data/edit_word.txt b/i18npool/source/breakiterator/data/edit_word.txt index 14fc221aa96e..1e3bcd15b20d 100644 --- a/i18npool/source/breakiterator/data/edit_word.txt +++ b/i18npool/source/breakiterator/data/edit_word.txt @@ -65,7 +65,7 @@ $Extended_Pict = [\p{Extended_Pictographic}]; $MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; # $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; -$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]-[:name = NARROW NO-BREAK SPACE:]]; ### END CUSTOMIZATION @@ -164,16 +164,18 @@ $Numeric $ExFm* ($MidNum | $MidNumLet | 
$Single_Quote) $ExFm* $Numeric; $Katakana $ExFm* $Katakana {400}; # rule 13a/b +# allow to select numbers with narrow no-break spaces as thousand separators +$ExtendNumLetNNBSP = [\p{Word_Break = ExtendNumLet}]; $ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) $Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) -$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Numeric $ExFm* $ExtendNumLetNNBSP {100}; # (13a) $Katakana $ExFm* $ExtendNumLet {400}; # (13a) $ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) $ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) $ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) -$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLetNNBSP $ExFm* $Numeric {100}; # (13b) $ExtendNumLet $ExFm* $Katakana {400}; # (13b) # rules 15 - 17 diff --git a/i18npool/source/breakiterator/data/edit_word_hu.txt b/i18npool/source/breakiterator/data/edit_word_hu.txt index 389ad2bacc13..a5e44d2732d9 100644 --- a/i18npool/source/breakiterator/data/edit_word_hu.txt +++ b/i18npool/source/breakiterator/data/edit_word_hu.txt @@ -81,7 +81,7 @@ $MidLetter = [\p{Word_Break = MidLetter} $Symbols_hu]; $MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; # $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; -$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]-[:name = NARROW NO-BREAK SPACE:]]; ### END CUSTOMIZATION @@ -180,16 +180,18 @@ $Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; $Katakana $ExFm* $Katakana {400}; # rule 13a/b +# allow to select numbers with narrow no-break spaces as thousand separators +$ExtendNumLetNNBSP = [\p{Word_Break = ExtendNumLet}]; $ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) $Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) -$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Numeric $ExFm* $ExtendNumLetNNBSP {100}; # (13a) $Katakana $ExFm* $ExtendNumLet {400}; # (13a) $ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) $ExtendNumLet 
$ExFm* $ALetterPlus {200}; # (13b) $ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) -$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLetNNBSP $ExFm* $Numeric {100}; # (13b) $ExtendNumLet $ExFm* $Katakana {400}; # (13b) # rules 15 - 17