i18npool/qa/cppunit/test_breakiterator.cxx | 18 ++++++++++++++ lingucomponent/source/spellcheck/spell/sspellimp.cxx | 23 +++++++++++++++---- sw/qa/extras/layout/layout2.cxx | 20 ++++++++++++++++ 3 files changed, 56 insertions(+), 5 deletions(-)
New commits: commit 3fdf13e7f9e0e053d11bfe477407ec9331160ef2 Author: László Németh <[email protected]> AuthorDate: Sat Dec 27 13:40:51 2025 +0100 Commit: Adolfo Jayme Barrientos <[email protected]> CommitDate: Tue Dec 30 22:36:36 2025 +0100 tdf#170140 lingucomponent: check words with non-ASCII apostrophe If the spelling dictionary contains them with non-ASCII apostrophe. Previously the words were checked only with apostrophes converted to their ASCII version, resulting in continuous false alarms, despite the correct orthography in the document and in the spelling dictionary. Now the words are checked both with ASCII conversion and with the original non-ASCII apostrophes (because there are still UTF-8 dictionaries with ASCII apostrophes and, moreover, dictionaries containing mixed apostrophes in the same dic file). Example words for the Hungarian dictionary: d’Arc, d’Alembert, McDonald’s (and their several recognized suffixed forms: d’Arcért, d’Arckal, d’Arcként, d’Arcnak, d’Arcot etc. etc.) Add unit tests for 1) break iterator and 2) spell checking with non-ASCII apostrophes. Change-Id: I24a570df41fa5aba2e7b67dde0db33377717dc2a Reviewed-on: https://gerrit.libreoffice.org/c/core/+/196248 Reviewed-by: László Németh <[email protected]> Tested-by: Jenkins Signed-off-by: Xisco Fauli <[email protected]> Reviewed-on: https://gerrit.libreoffice.org/c/core/+/196283 (cherry picked from commit 10d08d2ffead7597a44e0ce0cf4c5e98ffc29b22) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/196309 Reviewed-by: Adolfo Jayme Barrientos <[email protected]> diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index ae2e5af4e5e9..c91532d5ff3a 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -1844,7 +1844,18 @@ void TestBreakIterator::testDictWordAbbreviationHU() for (const auto& rLocale : aLocale) { - auto aTest = u"Pl. stb. 
dr.-ral Mo.-gal 50-et 50.-et"_ustr; + + auto aTest = + // abbreviations + u"Pl. stb. " + // abbreviations with suffixes + "dr.-ral Mo.-gal " + // number with a suffix + "50-et " + // ordinal number with a suffix + "50.-et " + // word with a non-ASCII apostrophe + "d’Arc"_ustr; i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 1, rLocale, i18n::WordType::DICTIONARY_WORD, false); @@ -1875,6 +1886,11 @@ void TestBreakIterator::testDictWordAbbreviationHU() = m_xBreak->getWordBoundary(aTest, 31, rLocale, i18n::WordType::DICTIONARY_WORD, false); CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 38, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(38), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(43), aBounds.endPos); } } diff --git a/lingucomponent/source/spellcheck/spell/sspellimp.cxx b/lingucomponent/source/spellcheck/spell/sspellimp.cxx index fe676cde5312..153ce18ec2be 100644 --- a/lingucomponent/source/spellcheck/spell/sspellimp.cxx +++ b/lingucomponent/source/spellcheck/spell/sspellimp.cxx @@ -243,6 +243,7 @@ sal_Bool SAL_CALL SpellChecker::hasLocale(const Locale& rLocale) return bRes; } +#define SPELL_NON_ASCII_APOSTROPHE 1 << 10 sal_Int16 SpellChecker::GetSpellFailure(const OUString &rWord, const Locale &rLocale, int& rInfo) { if (rWord.getLength() > MAXWORDLEN) @@ -261,15 +262,19 @@ sal_Int16 SpellChecker::GetSpellFailure(const OUString &rWord, const Locale &rLo sal_Int32 n = rBuf.getLength(); sal_Unicode c; sal_Int32 extrachar = 0; - + const bool bDoNotConvertApostrophe = bool(rInfo & SPELL_NON_ASCII_APOSTROPHE); + bool bHasNonASCIIApostrophe = false; + rInfo = 0; for (sal_Int32 ix=0; ix < n; ix++) { c = rBuf[ix]; if ((c == 0x201C) || (c == 0x201D)) rBuf[ix] = u'"'; - else if ((c == 0x2018) || (c == 0x2019)) + else if (!bDoNotConvertApostrophe && ((c == 0x2018) || (c == 0x2019))) + { rBuf[ix] = u'\''; - + 
bHasNonASCIIApostrophe = true; + } // recognize words with Unicode ligatures and ZWNJ/ZWJ characters (only // with 8-bit encoded dictionaries. For UTF-8 encoded dictionaries // set ICONV and IGNORE aff file options, if needed.) @@ -370,6 +375,10 @@ sal_Int16 SpellChecker::GetSpellFailure(const OUString &rWord, const Locale &rLo } } + // checked with apostrophe conversion + if ( bHasNonASCIIApostrophe ) + rInfo |= SPELL_NON_ASCII_APOSTROPHE; + return nRes; } @@ -396,8 +405,14 @@ sal_Bool SAL_CALL SpellChecker::isValid( const OUString& rWord, const Locale& rL PropertyHelper_Spelling& rHelper = GetPropHelper(); rHelper.SetTmpPropVals( rProperties ); - int nInfo = 0; + int nInfo = 0; // return compound information, disable apostrophe conversion sal_Int16 nFailure = GetSpellFailure( rWord, rLocale, nInfo ); + // it contains non-ASCII apostrophe, and it was bad with ASCII conversion: + // check the word with the original apostrophe character(s), too + if ( nFailure != -1 && nInfo & SPELL_NON_ASCII_APOSTROPHE ) { + nInfo = SPELL_NON_ASCII_APOSTROPHE; // disable apostrophe conversion + nFailure = GetSpellFailure( rWord, rLocale, nInfo ); + } if (nFailure != -1 && !rWord.match(SPELL_XML, 0)) { LanguageType nLang = LinguLocaleToLanguage( rLocale ); diff --git a/sw/qa/extras/layout/layout2.cxx b/sw/qa/extras/layout/layout2.cxx index a5ba9d7f9399..9db240eb85c3 100644 --- a/sw/qa/extras/layout/layout2.cxx +++ b/sw/qa/extras/layout/layout2.cxx @@ -1275,6 +1275,26 @@ CPPUNIT_TEST_FIXTURE(SwLayoutWriter2, testTdf158885_not_compound_remain) u"lenes emberellenes emberellenes emberellenes emberellenes emberellenes "); } +// TODO: move this test to the lingucomponent project +CPPUNIT_TEST_FIXTURE(SwLayoutWriter2, testTdf170140) +{ + uno::Reference<linguistic2::XSpellChecker1> xSpell = LinguMgr::GetSpellChecker(); + auto aLocale = lang::Locale(u"hu"_ustr, u"HU"_ustr, OUString()); + LanguageType eLang = LanguageTag::convertToLanguageType(aLocale); + if (!xSpell.is() || 
!xSpell->hasLanguage(static_cast<sal_uInt16>(eLang))) + return; + + uno::Sequence<beans::PropertyValue> aProperties; + + // correct non-ASCII apostrophe + OUString sWord(u"d’Arc"_ustr); + CPPUNIT_ASSERT(xSpell->isValid(sWord, static_cast<sal_uInt16>(eLang), aProperties)); + + // bad ASCII apostrophe + OUString sWord2(u"d'Arc"_ustr); + CPPUNIT_ASSERT(!xSpell->isValid(sWord2, static_cast<sal_uInt16>(eLang), aProperties)); +} + CPPUNIT_TEST_FIXTURE(SwLayoutWriter2, testRedlineNumberInFootnote) { createSwDoc("tdf85610.fodt");
