i18npool/CustomTarget_breakiterator.mk | 6 i18npool/qa/cppunit/test_breakiterator.cxx | 21 i18npool/source/breakiterator/data/dict_word.txt | 267 ++-- i18npool/source/breakiterator/data/dict_word_he.txt | 139 -- i18npool/source/breakiterator/data/dict_word_hu.txt | 324 ++--- i18npool/source/breakiterator/data/dict_word_nodash.txt | 147 -- i18npool/source/breakiterator/data/dict_word_prepostdash.txt | 288 ++-- i18npool/source/breakiterator/data/edit_word.txt | 261 ++-- i18npool/source/breakiterator/data/edit_word_he.txt | 142 -- i18npool/source/breakiterator/data/edit_word_hu.txt | 294 ++-- i18npool/source/breakiterator/data/line.txt | 680 +++-------- i18npool/source/breakiterator/data/sent.txt | 128 -- 12 files changed, 1085 insertions(+), 1612 deletions(-)
New commits: commit 587f4a8d0a031bfc99608958997a97b39e3e8314 Author: Jonathan Clark <jonat...@libreoffice.org> AuthorDate: Wed Apr 17 09:09:50 2024 -0600 Commit: Stephan Bergmann <stephan.bergm...@allotropia.de> CommitDate: Fri Sep 27 16:43:39 2024 +0200 tdf#49885 BreakIterator rule upgrades This change re-bases the BreakIterator rule customizations on top of a clean copy of the ICU 74.2 rules. Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166273 Tested-by: Jenkins Tested-by: Caolán McNamara <caolan.mcnam...@collabora.com> Reviewed-by: Caolán McNamara <caolan.mcnam...@collabora.com> (cherry-picked from 44699b3de37f07090ac6fee1cd97aa76036e9700, as its modifications to i18npool/source/breakiterator/data/line.txt happen to fix a flatpak build against org.freedesktop.Sdk//24.08, which would otherwise fail with > [BRK] CustomTarget/i18npool/breakiterator/line.brk S=/run/build/libreoffice && I=$S/instdir && W=$S/workdir && /usr/bin/genbrk -r $S/i18npool/source/breakiterator/data/line.txt -o $W/CustomTarget/i18npool/breakiterator/line.brk > createRuleBasedBreakIterator: ICU Error "U_BRK_UNRECOGNIZED_OPTION" at line 17, column 14 > make: *** [/run/build/libreoffice/i18npool/CustomTarget_breakiterator.mk:92: /run/build/libreoffice/workdir/CustomTarget/i18npool/breakiterator/line.brk] Error 12 ) Conflicts: i18npool/qa/cppunit/test_breakiterator.cxx Change-Id: Iadcf16cab138cc6c869fac61ad64e996e65b5ae4 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/174053 Tested-by: Stephan Bergmann <stephan.bergm...@allotropia.de> Reviewed-by: Stephan Bergmann <stephan.bergm...@allotropia.de> diff --git a/i18npool/CustomTarget_breakiterator.mk b/i18npool/CustomTarget_breakiterator.mk index 83ddcbcefdeb..c50ea884ff50 100644 --- a/i18npool/CustomTarget_breakiterator.mk +++ b/i18npool/CustomTarget_breakiterator.mk @@ -45,16 +45,12 @@ endif i18npool_BRKTXTS := \ count_word.brk \ - $(call gb_Helper_optional_locale,he,dict_word_he.brk) \ $(call 
gb_Helper_optional_locale,hu,dict_word_hu.brk) \ - dict_word_nodash.brk \ dict_word_prepostdash.brk \ dict_word.brk \ - $(call gb_Helper_optional_locale,he,edit_word_he.brk) \ $(call gb_Helper_optional_locale,hu,edit_word_hu.brk) \ edit_word.brk \ - line.brk \ - sent.brk + line.brk # 'gencmn', 'genbrk' and 'genccode' are tools generated and delivered by icu project to process icu breakiterator rules. # The output of gencmn generates warnings under Windows. We want to minimize the patches to external tools, diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 4463f46270e1..26e747b708fa 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -440,7 +440,8 @@ void TestBreakIterator::testWordBoundaries() CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i); } - //See https://bz.apache.org/ooo/show_bug.cgi?id=85411 + // i#85411: ZWSP should be a word separator for spellchecking + // - This fix was applied to both dict and edit customizations for (int j = 0; j < 3; ++j) { switch (j) @@ -462,21 +463,23 @@ void TestBreakIterator::testWordBoundaries() break; } - static constexpr OUString aTest = - u"I\u200Bwant\u200Bto\u200Bgo"_ustr; + static constexpr OUString aTest = u"I\u200Bwant\u200Bto\u200Bgo"_ustr; sal_Int32 nPos = 0; - sal_Int32 aExpected[] = {1, 6, 9, 12}; + sal_Int32 aExpected[] = { 1, 6, 9, 12 }; size_t i = 0; do { CPPUNIT_ASSERT(i < std::size(aExpected)); - nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, - i18n::WordType::DICTIONARY_WORD, true).endPos; - CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos); + auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos); + nPos = dwPos.endPos; ++i; - } - while 
(nPos++ < aTest.getLength()); + } while (nPos++ < aTest.getLength()); CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i); } diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt index b1666f44daab..f804b0eec214 100644 --- a/i18npool/source/breakiterator/data/dict_word.txt +++ b/i18npool/source/breakiterator/data/dict_word.txt @@ -1,148 +1,199 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: dict_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. 
- - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] - [:name = HYPHEN-MINUS:] ]; - -$SufixLetter = [:name= FULL STOP:]; - - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. 
+### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +$Han = [:Han:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### Unknown issue number: Dictionary words can contain hyphens +### tdf#49885: Sync custom BreakIterator rules with ICU originals +### - ICU is now more permissive about punctuation inside words. +### - For compatibility, exclude certain characters that were previously excluded. -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. 
-# -#################################################################################### +$IncludedML = [:name = HYPHEN-MINUS:]; +$ExcludedML = [[:name = COLON:] + [:name = GREEK ANO TELEIA:] + [:name = PRESENTATION FORM FOR VERTICAL COLON:] + [:name = SMALL COLON:] + [:name = FULLWIDTH COLON:]]; -$Format = [[:Cf:] - $TheZWSP]; +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; +### END CUSTOMIZATION +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? 
$FormatEx* $NumericEx)*; -$NumberSequence {100}; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; -[[:P:][:S:]]*; +## ------------------------------------------------- +# Rule 3 - CR x LF # -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +$ZWJ $Extended_Pict; +# Rule 3d - Keep horizontal whitespace together. # -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +$WSegSpace $WSegSpace; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. + +$ExFm = [$Extend $Format $ZWJ]; + +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. + +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, + # with no special rule status value. 
+ +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. # +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 + +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanhi, changed +# from 300 to 400. 
+# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | ) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/dict_word_he.txt b/i18npool/source/breakiterator/data/dict_word_he.txt deleted file mode 100644 index 40197d92a431..000000000000 --- a/i18npool/source/breakiterator/data/dict_word_he.txt +++ /dev/null @@ -1,139 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. 
-# -# file: dict_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Katakana - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]]; - -$SufixLetter = [:name= FULL STOP:]; - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. 
-# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; - -[[:P:][:S:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -# [:IDEOGRAPHIC:] $Extend* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. 
-# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; - -#!.*; -! ($NonStarters* | ) .; - diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt b/i18npool/source/breakiterator/data/dict_word_hu.txt index b0a0276b36a8..88648e6e5716 100644 --- a/i18npool/source/breakiterator/data/dict_word_hu.txt +++ b/i18npool/source/breakiterator/data/dict_word_hu.txt @@ -1,176 +1,222 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: dict_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. 
- - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - - -# Fix spelling of a)-ban, b)-ben, when the letter is a reference -# resulting bad word breaking "ban" and "ben" -# (reference fields are not expanded in spell checking, yet, only -# for grammar checking). - -$PrefixLetter = [[:name = RIGHT PARENTHESIS:]]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:] - [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] - [:name = DIGIT ZERO:] - [:name = DIGIT ONE:] - [:name = DIGIT TWO:] - [:name = DIGIT THREE:] - [:name = DIGIT FOUR:] - [:name = DIGIT FIVE:] - [:name = DIGIT SIX:] - [:name = DIGIT SEVEN:] - [:name = DIGIT EIGHT:] - [:name = DIGIT NINE:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:] - [:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN 
THOUSAND SIGN:] - [:name = EN DASH:] [:name = EM DASH:] - [:name = RIGHT DOUBLE QUOTATION MARK:] - [:name = LEFT PARENTHESIS:] - [:name = RIGHT PARENTHESIS:] - [:name = RIGHT SQUARE BRACKET:] - [:name = EXCLAMATION MARK:] - [:name = QUESTION MARK:] - [:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]]; - -$SufixLetter = [:name= FULL STOP:]; - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. 
# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - +$Han = [:Han:]; + +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; + +### BEGIN CUSTOMIZATION +### Unknown issue number: Dictionary words can contain hyphens +### tdf#49885: Sync custom BreakIterator rules with ICU originals +### - ICU is now more permissive about punctuation inside words. +### - For compatibility, exclude certain characters that were previously excluded. 
+### tdf#116072: Extend MidLetter in Hungarian word breaking +### i#56347: BreakIterator patch for Hungarian +### i#56348: Special chars in first pos not handled by spell checking for Hungarian + +$Symbols_hu = [[:name = PERCENT SIGN:] + [:name = PER MILLE SIGN:] + [:name = PER TEN THOUSAND SIGN:] + [:name = SECTION SIGN:] + [:name = DEGREE SIGN:] + [:name = EURO SIGN:] + [:name = HYPHEN-MINUS:] + [:name = EN DASH:] + [:name = EM DASH:]]; + +#$ALetter = [\p{Word_Break = ALetter}]; +$ALetter = [\p{Word_Break = ALetter} $Symbols_hu]; + +$IncludedML = [:name = HYPHEN-MINUS:]; +$ExcludedML = [[:name = COLON:] + [:name = GREEK ANO TELEIA:] + [:name = PRESENTATION FORM FOR VERTICAL COLON:] + [:name = SMALL COLON:] + [:name = FULLWIDTH COLON:]]; + +$IncludedML_hu = [[:name = RIGHT DOUBLE QUOTATION MARK:] + [:name = LEFT PARENTHESIS:] + [:name = RIGHT PARENTHESIS:] + [:name = RIGHT SQUARE BRACKET:] + [:name = EXCLAMATION MARK:] + [:name = QUESTION MARK:] + $Symbols_hu]; + +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML $IncludedML_hu]; + +### END CUSTOMIZATION + +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; + + +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. 
+ +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; + +# TODO: check if handling of katakana in dictionary makes rules incorrect/void + +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + + +## ------------------------------------------------- + +# Rule 3 - CR x LF +# +$CR $LF; -#################################################################################### +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Word Break Rules. Definitions and Rules specific to word break begin Here. +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. # -#################################################################################### +$WSegSpace $WSegSpace; -$Format = [[:Cf:] - $TheZWSP]; +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. +$ExFm = [$Extend $Format $ZWJ]; +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, + # with no special rule status value. 
+$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +# rule 5 +# Do not break between most letters. # -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $PrefixLetter? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; -[[:P:][:S:]]*; +# rule 8 -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$Numeric $ExFm* $Numeric; -# -# Ideographic Characters. 
Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +# rule 9 -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanji, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. 
+# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | ) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/dict_word_nodash.txt b/i18npool/source/breakiterator/data/dict_word_nodash.txt deleted file mode 100644 index 279cc50e5b66..000000000000 --- a/i18npool/source/breakiterator/data/dict_word_nodash.txt +++ /dev/null @@ -1,147 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: dict_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. 
-# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] ]; - -$SufixLetter = [:name= FULL STOP:]; - - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. 
-# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; - -[[:P:][:S:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. 
-# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; - -#!.*; -! ($NonStarters* | ) .; - diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt index fb29b478af21..b39503d1b405 100644 --- a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt +++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt @@ -1,157 +1,221 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: dict_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. 
- - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +############################################################################## -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION -# list of dashes or hyphens that should be accepted as part of the word if a single one of these -# pre- or postfixes a word. E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to -# be part of the word in order to have it properly spell checked etc. 
-$PrePostDashHyphen = [ [:name = HYPHEN-MINUS:] ]; +!!chain; +!!quoted_literals_only; -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] - [:name = HYPHEN-MINUS:] ]; +# +# Character Class Definitions. +# -$SufixLetter = [:name= FULL STOP:]; - +$Han = [:Han:]; -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### Unknown issue number: Dictionary words can contain hyphens +### tdf#49885: Sync custom BreakIterator rules with 
ICU originals +### - ICU is now more permissive about punctuation inside words. +### - For compatibility, exclude certain characters that were previously excluded. -$TheZWSP = \u200b; +$IncludedML = [:name = HYPHEN-MINUS:]; +$ExcludedML = [[:name = COLON:] + [:name = GREEK ANO TELEIA:] + [:name = PRESENTATION FORM FOR VERTICAL COLON:] + [:name = SMALL COLON:] + [:name = FULLWIDTH COLON:]]; -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; +### END CUSTOMIZATION +### BEGIN CUSTOMIZATION +### Unknown issue number: Allow leading and trailing hyphens in certain languages +### This part of the customization does not replace any rules. +$PrePostHyphen = [:name = HYPHEN-MINUS:]; -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### +### END CUSTOMIZATION -$Format = [[:Cf:] - $TheZWSP]; +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. 
-# +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; +## ------------------------------------------------- + +# Rule 3 - CR x LF # -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. # -# At most one leading or trailing dash/hyphen should be accepted as well. -# E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to -# be part of the word in order to have it properly spell checked etc. -$LetterSequence = $PrePostDashHyphen? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)* $PrePostDashHyphen?; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? 
{200}; +$WSegSpace $WSegSpace; -[[:P:][:S:]]*; +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$ExFm = [$Extend $Format $ZWJ]; -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. # -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) 
The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +### BEGIN CUSTOMIZATION +### Unknown issue number: Allow leading and trailing hyphens in certain languages + +# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); +($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)?; + +### END CUSTOMIZATION + +# rule 6 and 7 + +### BEGIN CUSTOMIZATION +### Unknown issue number: Allow leading and trailing hyphens in certain languages + +# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; +($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)? {200}; + +### END CUSTOMIZATION + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 + +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanji, changed +# from 300 to 400. 
+# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | ) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/edit_word.txt b/i18npool/source/breakiterator/data/edit_word.txt index 92b344c19d41..14fc221aa96e 100644 --- a/i18npool/source/breakiterator/data/edit_word.txt +++ b/i18npool/source/breakiterator/data/edit_word.txt @@ -1,142 +1,199 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. 
# -# file: edit_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. - - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. 
+### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +$Han = [:Han:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidLetter = [\p{Word_Break = MidLetter}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. +### This change subtracts undesired characters from the above families -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### +# $MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; -$Format = [[:Cf:] - $TheZWSP]; +# $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]]; +### END CUSTOMIZATION +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; -# Rule 3: Treat a grapheme cluster as if it were a single character. 
-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? 
$LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; -# Punctuations by themselves -[[:P:][:S:]-[:name = FULL STOP:]]*; -[[:name = FULL STOP:]]*; +## ------------------------------------------------- +# Rule 3 - CR x LF # -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +$ZWJ $Extended_Pict; +# Rule 3d - Keep horizontal whitespace together. # -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +$WSegSpace $WSegSpace; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. + +$ExFm = [$Extend $Format $ZWJ]; + +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. + +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. 
+$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. # +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 + +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanji, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +# rules 15 - 17 +# Pairs of Regional Indicators stay together. 
+# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | ) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found + +### BEGIN CUSTOMIZATION +### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. +### This customization does not replace any rules. +[[:P:][:S:]-[:name = FULL STOP:]]* +[[:name = FULL STOP:]]*; +### END CUSTOMIZATION +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/edit_word_he.txt b/i18npool/source/breakiterator/data/edit_word_he.txt deleted file mode 100644 index 0b5908814e08..000000000000 --- a/i18npool/source/breakiterator/data/edit_word_he.txt +++ /dev/null @@ -1,142 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: edit_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. 
-# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. 
Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; - -# Punctuations by themselves -[[:P:][:S:]-[:name = FULL STOP:]]*; -[[:name = FULL STOP:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) 
The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; - -#!.*; -! ($NonStarters* | ) .; - diff --git a/i18npool/source/breakiterator/data/edit_word_hu.txt b/i18npool/source/breakiterator/data/edit_word_hu.txt index 4a08acab0029..389ad2bacc13 100644 --- a/i18npool/source/breakiterator/data/edit_word_hu.txt +++ b/i18npool/source/breakiterator/data/edit_word_hu.txt @@ -1,159 +1,215 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: edit_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. 
- - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] - [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:] - [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] - [:name = DIGIT ZERO:] - [:name = DIGIT ONE:] - [:name = DIGIT TWO:] - [:name = DIGIT THREE:] - [:name = DIGIT FOUR:] - [:name = DIGIT FIVE:] - [:name = DIGIT SIX:] - [:name = DIGIT SEVEN:] - [:name = DIGIT EIGHT:] - [:name = DIGIT NINE:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] - [:name = HYPHEN-MINUS:] [:name = EURO SIGN:] [:name = PERCENT SIGN:] - [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = EN DASH:] [:name = EM DASH:] - [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule 
customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +$Han = [:Han:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. +### This change subtracts undesired characters from the above families +### i#56347: BreakIterator patch for Hungarian +### i#56348: Special chars in first pos not handled by spell checking for Hungarian -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. 
-# -#################################################################################### +$Symbols_hu = [[:name = PERCENT SIGN:] + [:name = PER MILLE SIGN:] + [:name = PER TEN THOUSAND SIGN:] + [:name = SECTION SIGN:] + [:name = DEGREE SIGN:] + [:name = EURO SIGN:] + [:name = HYPHEN-MINUS:] + [:name = EN DASH:] + [:name = EM DASH:]]; -$Format = [[:Cf:] - $TheZWSP]; +# $ALetter = [\p{Word_Break = ALetter}]; +$ALetter = [\p{Word_Break = ALetter} $Symbols_hu]; +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [\p{Word_Break = MidLetter} $Symbols_hu]; +# $MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +# $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]]; +### END CUSTOMIZATION -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# -# Words. Alpha-numerics. 
Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; -# Punctuations by themselves -[[:P:][:S:]-[:name = FULL STOP:]]*; -[[:name = FULL STOP:]]*; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + +## ------------------------------------------------- + +# Rule 3 - CR x LF # -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. # -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +$WSegSpace $WSegSpace; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. 
+ +$ExFm = [$Extend $Format $ZWJ]; + +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. + +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. # +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. 
+($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanji, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | ) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found + +### BEGIN CUSTOMIZATION +### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. +### This customization does not replace any rules. +[[:P:][:S:]-[:name = FULL STOP:]]* +[[:name = FULL STOP:]]*; +### END CUSTOMIZATION +# Rule 999 +# Match a single code point if no other rule applies. 
+.; diff --git a/i18npool/source/breakiterator/data/line.txt b/i18npool/source/breakiterator/data/line.txt index ff3f3eafc42e..46a618c63cae 100644 --- a/i18npool/source/breakiterator/data/line.txt +++ b/i18npool/source/breakiterator/data/line.txt @@ -1,176 +1,116 @@ -# Copyright (c) 2002-2006 International Business Machines Corporation and +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # # file: line.txt # # Line Breaking Rules -# Implement default line breaking as defined by Unicode Standard Annex #14 version 5.0.0 -# http://www.unicode.org/reports/tr14/ - - +# Implement default line breaking as defined by +# Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/) +# for Unicode 14.0, with the following modification: +# +# Boundaries between hyphens and following letters are suppressed when +# there is a boundary preceding the hyphen. See rule 20.9 +# +# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict). +# It sets characters of class CJ to behave like NS. # # Character Classes defined by TR 14. # -!!chain; -!!LBCMNoChain; +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION - -!!lookAheadHardBreak; -# -# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere -# and only used for the line break rules. 
-# -# It is used in the implementation of the incredibly annoying rule LB 10 -# which says to treat any combining mark that is not attached to a base -# character as if it were of class AL (alphabetic). -# -# The problem occurs in the reverse rules. -# -# Consider a sequence like, with correct breaks as shown -# LF ID CM AL AL -# ^ ^ ^ -# Then consider the sequence without the initial ID (ideographic) -# LF CM AL AL -# ^ ^ -# Our CM, which in the first example was attached to the ideograph, -# is now unattached, becomes an alpha, and joins in with the other -# alphas. -# -# When iterating forwards, these sequences do not present any problems -# When iterating backwards, we need to look ahead when encountering -# a CM to see whether it attaches to something further on or not. -# (Look-ahead in a reverse rule is looking towards the start) -# -# If the CM is unattached, we need to force a break. -# -# !!lookAheadHardBreak forces the run time state machine to -# stop immediately when a look ahead rule ( '/' operator) matches, -# and set the match position to that of the look-ahead operator, -# no matter what other rules may be in play at the time. -# -# See rule LB 19 for an example. -# +!!chain; +!!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; -$DG = \u00B0; -$AL = [[:LineBreak = Alphabetic:] $DG]; +$AL = [:LineBreak = Alphabetic:]; $BA = [:LineBreak = Break_After:]; +$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. 
$BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; $B2 = [:LineBreak = Break_Both:]; $CB = [:LineBreak = Contingent_Break:]; $CJ = [:LineBreak = Conditional_Japanese_Starter:]; -$CL = [[:LineBreak = Close_Punctuation:] [:LineBreak = Close_Parenthesis:]]; # tdf#31271 -$CM = [:LineBreak = Combining_Mark:]; +$CL = [:LineBreak = Close_Punctuation:]; +# $CM = [:LineBreak = Combining_Mark:]; +$CP = [:LineBreak = Close_Parenthesis:]; $CR = [:LineBreak = Carriage_Return:]; +$EB = [:LineBreak = EB:]; +$EM = [:LineBreak = EM:]; $EX = [:LineBreak = Exclamation:]; $GL = [:LineBreak = Glue:]; $HL = [:LineBreak = Hebrew_Letter:]; $HY = [:LineBreak = Hyphen:]; $H2 = [:LineBreak = H2:]; $H3 = [:LineBreak = H3:]; -$ID = [[:LineBreak = Ideographic:] - [\ufe30]]; -$IN = [:LineBreak = Inseparable:]; -$IS = [[:LineBreak = Infix_Numeric:] [\ufe30]]; +$ID = [:LineBreak = Ideographic:]; +$IN = [:LineBreak = Inseperable:]; +$IS = [:LineBreak = Infix_Numeric:]; $JL = [:LineBreak = JL:]; $JV = [:LineBreak = JV:]; $JT = [:LineBreak = JT:]; $LF = [:LineBreak = Line_Feed:]; $NL = [:LineBreak = Next_Line:]; +# NS includes CJ for CSS strict line breaking. $NS = [[:LineBreak = Nonstarter:] $CJ]; $NU = [:LineBreak = Numeric:]; -$OP = [[:LineBreak = Open_Punctuation:] - $DG]; +$OP = [:LineBreak = Open_Punctuation:]; $PO = [:LineBreak = Postfix_Numeric:]; -$BS = \u005C; -$PR = [[:LineBreak = Prefix_Numeric:] - $BS]; +$PR = [:LineBreak = Prefix_Numeric:]; $QU = [:LineBreak = Quotation:]; +$RI = [:LineBreak = Regional_Indicator:]; $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; -$SY = [[:LineBreak = Break_Symbols:] $BS]; +$SY = [:LineBreak = Break_Symbols:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; +$ZWJ = [:LineBreak = ZWJ:]; + +# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14, +# without a formal name. 
Because ICU rules require multiple uses of the expressions, +# give them a single definition with a name + +$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + +$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}]; + +# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly +# list it in the numerous rules that use CM. +# By LB1, SA characters with general category of Mn or Mc also resolve to CM. + +$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]]; +$CMX = [[$CM] - [$ZWJ]]; # Dictionary character set, for triggering language-based break engines. Currently -# limited to LineBreak=Complex_Context. Note that this set only works in Unicode -# 5.0 or later as the definition of Complex_Context was corrected to include all -# characters requiring dictionary break. +# limited to LineBreak=Complex_Context (SA). -$dictionary = [:LineBreak = Complex_Context:]; +$dictionary = [$SA]; # # Rule LB1. By default, treat AI (characters with ambiguous east Asian width), -# SA (South East Asian: Thai, Lao, Khmer) +# SA (Dictionary chars, excluding Mn and Mc) # SG (Unpaired Surrogates) # XX (Unknown, unassigned) # as $AL (Alphabetic) # -$ALPlus = [$AL $AI $SA $SG $XX]; - -# -# Combining Marks. X $CM* behaves as if it were X. Rule LB6. 
-# -$ALcm = $ALPlus $CM*; -$BAcm = $BA $CM*; -$BBcm = $BB $CM*; -$B2cm = $B2 $CM*; -$CLcm = $CL $CM*; -$EXcm = $EX $CM*; -$GLcm = $GL $CM*; -$HLcm = $HL $CM*; -$HYcm = $HY $CM*; -$H2cm = $H2 $CM*; -$H3cm = $H3 $CM*; -$IDcm = $ID $CM*; -$INcm = $IN $CM*; -$IScm = $IS $CM*; -$JLcm = $JL $CM*; -$JVcm = $JV $CM*; -$JTcm = $JT $CM*; -$NScm = $NS $CM*; -$NUcm = $NU $CM*; -$OPcm = $OP $CM*; -$POcm = $PO $CM*; -$PRcm = $PR $CM*; -$QUcm = $QU $CM*; -$SYcm = $SY $CM*; -$WJcm = $WJ $CM*; +$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]]; -## ------------------------------------------------- -!!forward; - -# -# Each class of character can stand by itself as an unbroken token, with trailing combining stuff -# -$ALPlus $CM+; -$BA $CM+; -$BB $CM+; -$B2 $CM+; -$CL $CM+; -$EX $CM+; -$GL $CM+; -$HL $CM+; -$HY $CM+; -$H2 $CM+; -$H3 $CM+; -$ID $CM+; -$IN $CM+; -$IS $CM+; -$JL $CM+; -$JV $CM+; -$JT $CM+; -$NS $CM+; -$NU $CM+; -$OP $CM+; -$PO $CM+; -$PR $CM+; -$QU $CM+; -$SY $CM+; -$WJ $CM+; +## ------------------------------------------------- # # CAN_CM is the set of characters that may combine with CM combining chars. @@ -186,19 +126,15 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs # # AL_FOLLOW set of chars that can unconditionally follow an AL # Needed in rules where stand-alone $CM s are treated as AL. -# Chaining is disabled with CM because it causes other failures, -# so for this one case we need to manually list out longer sequences. # -$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP]; -$AL_FOLLOW_CM = [$CL $EX $HL $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP]; -$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; +$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; # # Rule LB 4, 5 Mandatory (Hard) breaks. # $LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL]; +$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; $CR $LF {100}; # @@ -206,91 +142,124 @@ $CR $LF {100}; # $LB4NonBreaks? 
$LB4Breaks {100}; # LB 5 do not break before hard breaks. $CAN_CM $CM* $LB4Breaks {100}; -$CM+ $LB4Breaks {100}; +^$CM+ $LB4Breaks {100}; # LB 7 x SP # x ZW $LB4NonBreaks [$SP $ZW]; $CAN_CM $CM* [$SP $ZW]; -$CM+ [$SP $ZW]; +^$CM+ [$SP $ZW]; # # LB 8 Break after zero width space +# ZW SP* ÷ # $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +$ZW $SP* / [^$SP $ZW $LB4Breaks]; +# LB 8a ZWJ x Do not break Emoji ZWJ sequences. +# +$ZWJ [^$CM]; -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL +# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -e ... etc. - the rest is truncated