source

Jonathan Clark (via logerrit) Fri, 27 Sep 2024 07:44:09 -0700

 i18npool/CustomTarget_breakiterator.mk                       |    6 
 i18npool/qa/cppunit/test_breakiterator.cxx                   |   21 
 i18npool/source/breakiterator/data/dict_word.txt             |  267 ++--
 i18npool/source/breakiterator/data/dict_word_he.txt          |  139 --
 i18npool/source/breakiterator/data/dict_word_hu.txt          |  324 ++---
 i18npool/source/breakiterator/data/dict_word_nodash.txt      |  147 --
 i18npool/source/breakiterator/data/dict_word_prepostdash.txt |  288 ++--
 i18npool/source/breakiterator/data/edit_word.txt             |  261 ++--
 i18npool/source/breakiterator/data/edit_word_he.txt          |  142 --
 i18npool/source/breakiterator/data/edit_word_hu.txt          |  294 ++--
 i18npool/source/breakiterator/data/line.txt                  |  680 +++--------
 i18npool/source/breakiterator/data/sent.txt                  |  128 --
 12 files changed, 1085 insertions(+), 1612 deletions(-)


New commits:
commit 587f4a8d0a031bfc99608958997a97b39e3e8314
Author:     Jonathan Clark <jonat...@libreoffice.org>
AuthorDate: Wed Apr 17 09:09:50 2024 -0600
Commit:     Stephan Bergmann <stephan.bergm...@allotropia.de>
CommitDate: Fri Sep 27 16:43:39 2024 +0200

    tdf#49885 BreakIterator rule upgrades
    
    This change re-bases the BreakIterator rule customizations on top of a
    clean copy of the ICU 74.2 rules.
    
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166273
    Tested-by: Jenkins
    Tested-by: Caolán McNamara <caolan.mcnam...@collabora.com>
    Reviewed-by: Caolán McNamara <caolan.mcnam...@collabora.com>
    
    (cherry-picked from 44699b3de37f07090ac6fee1cd97aa76036e9700, as its
    modifications to i18npool/source/breakiterator/data/line.txt happen to fix a
    flatpak build against org.freedesktop.Sdk//24.08, which would otherwise fail
    with
    
    > [BRK] CustomTarget/i18npool/breakiterator/line.brk
    S=/run/build/libreoffice && I=$S/instdir && W=$S/workdir &&  /usr/bin/genbrk  -r $S/i18npool/source/breakiterator/data/line.txt -o $W/CustomTarget/i18npool/breakiterator/line.brk
    > createRuleBasedBreakIterator: ICU Error "U_BRK_UNRECOGNIZED_OPTION"  at line 17, column 14
    > make: *** [/run/build/libreoffice/i18npool/CustomTarget_breakiterator.mk:92: /run/build/libreoffice/workdir/CustomTarget/i18npool/breakiterator/line.brk] Error 12
    )
    Conflicts:
            i18npool/qa/cppunit/test_breakiterator.cxx
    
    Change-Id: Iadcf16cab138cc6c869fac61ad64e996e65b5ae4
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/174053
    Tested-by: Stephan Bergmann <stephan.bergm...@allotropia.de>
    Reviewed-by: Stephan Bergmann <stephan.bergm...@allotropia.de>

diff --git a/i18npool/CustomTarget_breakiterator.mk b/i18npool/CustomTarget_breakiterator.mk
index 83ddcbcefdeb..c50ea884ff50 100644
--- a/i18npool/CustomTarget_breakiterator.mk
+++ b/i18npool/CustomTarget_breakiterator.mk
@@ -45,16 +45,12 @@ endif
 
 i18npool_BRKTXTS := \
     count_word.brk \
-    $(call gb_Helper_optional_locale,he,dict_word_he.brk) \
     $(call gb_Helper_optional_locale,hu,dict_word_hu.brk) \
-    dict_word_nodash.brk \
     dict_word_prepostdash.brk \
     dict_word.brk \
-    $(call gb_Helper_optional_locale,he,edit_word_he.brk) \
     $(call gb_Helper_optional_locale,hu,edit_word_hu.brk) \
     edit_word.brk \
-    line.brk \
-    sent.brk
+    line.brk
 
 # 'gencmn', 'genbrk' and 'genccode' are tools generated and delivered by icu project to process icu breakiterator rules.
 # The output of gencmn generates warnings under Windows. We want to minimize the patches to external tools,
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index 4463f46270e1..26e747b708fa 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -440,7 +440,8 @@ void TestBreakIterator::testWordBoundaries()
         CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
     }
 
-    //See https://bz.apache.org/ooo/show_bug.cgi?id=85411
+    // i#85411: ZWSP should be a word separator for spellchecking
+    // - This fix was applied to both dict and edit customizations
     for (int j = 0; j < 3; ++j)
     {
         switch (j)
@@ -462,21 +463,23 @@ void TestBreakIterator::testWordBoundaries()
                 break;
         }
 
-        static constexpr OUString aTest =
-            u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
+        static constexpr OUString aTest = u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
 
         sal_Int32 nPos = 0;
-        sal_Int32 aExpected[] = {1, 6, 9, 12};
+        sal_Int32 aExpected[] = { 1, 6, 9, 12 };
         size_t i = 0;
         do
         {
             CPPUNIT_ASSERT(i < std::size(aExpected));
-            nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
-                i18n::WordType::DICTIONARY_WORD, true).endPos;
-            CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
+            auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
+                                                   i18n::WordType::DICTIONARY_WORD, true);
+            CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos);
+            auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
+                                                   i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+            CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos);
+            nPos = dwPos.endPos;
             ++i;
-        }
-        while (nPos++ < aTest.getLength());
+        } while (nPos++ < aTest.getLength());
         CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
     }
 
diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt
index b1666f44daab..f804b0eec214 100644
--- a/i18npool/source/breakiterator/data/dict_word.txt
+++ b/i18npool/source/breakiterator/data/dict_word.txt
@@ -1,148 +1,199 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
 #
-#   file:  dict_word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW 
PUNCTUATION GERESH:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = 
SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL 
LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:] [:name = PRIME:] 
-              [:name = HYPHEN-MINUS:] ];
-
-$SufixLetter = [:name= FULL STOP:];
-              
-
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 
[:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE 
QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
+##############################################################################
+
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed alongside.
+### - By doing this, maintainers can more easily compare to an upstream baseline.
+###
+### END CUSTOMIZATION
+
+!!chain;
+!!quoted_literals_only;
+
 
 #
 #  Character Class Definitions.
-#    The names are those from TR29.
 #
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
 
+$Han                = [:Han:];
 
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}-$Han];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$ALetter            = [\p{Word_Break = ALetter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [\p{Extended_Pictographic}];
 
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Dictionary words can contain hyphens
+### tdf#49885: Sync custom BreakIterator rules with ICU originals
+### - ICU is now more permissive about punctuation inside words.
+### - For compatibility, exclude certain characters that were previously excluded.
 
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
+$IncludedML         = [:name = HYPHEN-MINUS:];
+$ExcludedML         = [[:name = COLON:]
+                       [:name = GREEK ANO TELEIA:]
+                       [:name = PRESENTATION FORM FOR VERTICAL COLON:]
+                       [:name = SMALL COLON:]
+                       [:name = FULLWIDTH COLON:]];
 
-$Format    = [[:Cf:] - $TheZWSP];
+# $MidLetter          = [\p{Word_Break = MidLetter}];
+$MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
 
+### END CUSTOMIZATION
 
+$Hiragana           = [:Hiragana:];
+$Ideographic        = [\p{Ideographic}];
 
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
 
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
 
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
 
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
 
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* $SufixLetterEx? {200};
 
-[[:P:][:S:]]*;
+## -------------------------------------------------
 
+# Rule 3 - CR x LF
 #
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+$CR $LF;
 
+# Rule 3c   Do not break within emoji zwj sequences.
+#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no intervening Extend chars allowed.
 #
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
+$ZWJ $Extended_Pict;
 
+# Rule 3d - Keep horizontal whitespace together.
 #
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+$WSegSpace $WSegSpace;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
+#          of a region of Text.
+
+$ExFm  = [$Extend $Format $ZWJ];
+
+^$ExFm+;            # This rule fires only when there are format or extend characters at the
+                    # start of text, or immediately following another boundary. It groups them, in
+                    # the event there are more than one.
+
+[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule rule attaches trailing format/extends to words,
+                                    # with no special rule status value.
+
+$Numeric $ExFm* {100};              # This group of rules also attach trailing format/extends, but
+$ALetterPlus $ExFm* {200};          # with rule status set based on the word's final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {400};             # note:  these status values override those from rule 5
+$Hiragana $ExFm* {400};             #        by virtue of being numerically larger.
+$Ideographic $ExFm* {400};          #
 
 #
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
+# rule 5
+#    Do not break between most letters.
 #
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 6 and 7
+($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
+
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
+
+# rule 8
+
+$Numeric $ExFm* $Numeric;
+
+# rule 9
+
+($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
+
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanhi, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm*  $Katakana {400};
+
+# rule 13a/b
+
+$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
+$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
+$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
+$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
+
+$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
+$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
 
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With incoming rule chaining disabled by ^, this rule will match exactly two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
 #
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $SufixLetter $Extend $Format];
+^$Regional_Indicator $ExFm* $Regional_Indicator;
 
-#!.*;
-! ($NonStarters* | 
  ) .;
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
 
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;
diff --git a/i18npool/source/breakiterator/data/dict_word_he.txt b/i18npool/source/breakiterator/data/dict_word_he.txt
deleted file mode 100644
index 40197d92a431..000000000000
--- a/i18npool/source/breakiterator/data/dict_word_he.txt
+++ /dev/null
@@ -1,139 +0,0 @@
-#
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
-#
-#   file:  dict_word.txt   
-#
-#   ICU Word Break Rules
-#      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
-#
-
-
-
-####################################################################################
-#
-#  Character class definitions from TR 29
-#
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-
-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW 
PUNCTUATION GERESH:]
-                           - $Katakana
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = GRAVE 
ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK 
TONOS:] [:name= FULL STOP:]
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL 
LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]];  
-              
-$SufixLetter = [:name= FULL STOP:];
-
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 
[:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE 
QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
-
-#
-#  Character Class Definitions.
-#    The names are those from TR29.
-#
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-
-
-
-
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
-
-$Format    = [[:Cf:] - $TheZWSP];
-
-
-
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
-
-
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$FormatEx     = $Format    $Extend*;
-
-
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
-
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* $SufixLetterEx? {200};
-
-[[:P:][:S:]]*;
-
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
-
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-# [:IDEOGRAPHIC:] $Extend* {400};
-
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
-
-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
-#
-
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
-#
-$NonStarters = [$Numeric $ALetter $Katakana [:P:] [:S:] $MidLetter $MidNum 
$SufixLetter $Extend $Format];
-
-#!.*;
-! ($NonStarters* | 
  ) .;
-
diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt b/i18npool/source/breakiterator/data/dict_word_hu.txt
index b0a0276b36a8..88648e6e5716 100644
--- a/i18npool/source/breakiterator/data/dict_word_hu.txt
+++ b/i18npool/source/breakiterator/data/dict_word_hu.txt
@@ -1,176 +1,222 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
 #
-#   file:  dict_word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-
-# Fix spelling of a)-ban, b)-ben, when the letter is a reference
-# resulting bad word breaking "ban" and "ben"
-# (reference fields are not expanded in spell checking, yet, only
-# for grammar checking).
-
-$PrefixLetter = [[:name = RIGHT PARENTHESIS:]];
-
-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW 
PUNCTUATION GERESH:]
-                [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER 
TEN THOUSAND SIGN:]
-                [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO 
SIGN:]
-                [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:]
-                [:name = DIGIT ZERO:]
-                [:name = DIGIT ONE:]
-                [:name = DIGIT TWO:]
-                [:name = DIGIT THREE:]
-                [:name = DIGIT FOUR:]
-                [:name = DIGIT FIVE:]
-                [:name = DIGIT SIX:]
-                [:name = DIGIT SEVEN:]
-                [:name = DIGIT EIGHT:]
-                [:name = DIGIT NINE:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = 
SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:]
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL 
LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]
-              [:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE 
SIGN:] [:name = PER TEN THOUSAND SIGN:]
-              [:name = EN DASH:] [:name = EM DASH:]
-              [:name = RIGHT DOUBLE QUOTATION MARK:]
-              [:name = LEFT PARENTHESIS:]
-              [:name = RIGHT PARENTHESIS:]
-              [:name = RIGHT SQUARE BRACKET:]
-              [:name = EXCLAMATION MARK:]
-              [:name = QUESTION MARK:]
-              [:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION 
SIGN:] [:name = DEGREE SIGN:]];  
-              
-$SufixLetter = [:name= FULL STOP:];
-
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 
[:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE 
QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
+##############################################################################
+
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed alongside.
+### - By doing this, maintainers can more easily compare to an upstream baseline.
+###
+### END CUSTOMIZATION
+
+!!chain;
+!!quoted_literals_only;
+
 
 #
 #  Character Class Definitions.
-#    The names are those from TR29.
 #
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-
-
 
+$Han                = [:Han:];
+
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}-$Han];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [\p{Extended_Pictographic}];
+
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Dictionary words can contain hyphens
+### tdf#49885: Sync custom BreakIterator rules with ICU originals
+### - ICU is now more permissive about punctuation inside words.
+### - For compatibility, exclude certain characters that were previously excluded.
+### tdf#116072: Extend MidLetter in Hungarian word breaking
+### i#56347: BreakIterator patch for Hungarian
+### i#56348: Special chars in first pos not handled by spell checking for Hungarian
+
+$Symbols_hu         = [[:name = PERCENT SIGN:]
+                       [:name = PER MILLE SIGN:]
+                       [:name = PER TEN THOUSAND SIGN:]
+                       [:name = SECTION SIGN:]
+                       [:name = DEGREE SIGN:]
+                       [:name = EURO SIGN:]
+                       [:name = HYPHEN-MINUS:]
+                       [:name = EN DASH:]
+                       [:name = EM DASH:]];
+
+#$ALetter            = [\p{Word_Break = ALetter}];
+$ALetter            = [\p{Word_Break = ALetter} $Symbols_hu];
+
+$IncludedML         = [:name = HYPHEN-MINUS:];
+$ExcludedML         = [[:name = COLON:]
+                       [:name = GREEK ANO TELEIA:]
+                       [:name = PRESENTATION FORM FOR VERTICAL COLON:]
+                       [:name = SMALL COLON:]
+                       [:name = FULLWIDTH COLON:]];
+
+$IncludedML_hu      = [[:name = RIGHT DOUBLE QUOTATION MARK:]
+                       [:name = LEFT PARENTHESIS:]
+                       [:name = RIGHT PARENTHESIS:]
+                       [:name = RIGHT SQUARE BRACKET:]
+                       [:name = EXCLAMATION MARK:]
+                       [:name = QUESTION MARK:]
+                       $Symbols_hu];
+
+# $MidLetter          = [\p{Word_Break = MidLetter}];
+$MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML $IncludedML_hu];
+
+### END CUSTOMIZATION
+
+$Hiragana           = [:Hiragana:];
+$Ideographic        = [\p{Ideographic}];
+
+
+#   Dictionary character set, for triggering language-based break engines. 
Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in 
Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include 
all
+#   characters requiring dictionary break.
+
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
+
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+
+
+## -------------------------------------------------
+
+# Rule 3 - CR x LF
+#
+$CR $LF;
 
-####################################################################################
+# Rule 3c   Do not break within emoji zwj sequences.
+#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no 
intervening Extend chars allowed.
 #
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
+$ZWJ $Extended_Pict;
+
+# Rule 3d - Keep horizontal whitespace together.
 #
-####################################################################################
+$WSegSpace $WSegSpace;
 
-$Format    = [[:Cf:] - $TheZWSP];
+# Rule 4 - ignore Format and Extend characters, except when they appear at the 
beginning
+#          of a region of Text.
 
+$ExFm  = [$Extend $Format $ZWJ];
 
+^$ExFm+;            # This rule fires only when there are format or extend 
characters at the
+                    # start of text, or immediately following another 
boundary. It groups them, in
+                    # the event there are more than one.
 
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
+[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing 
format/extends to words,
+                                    # with no special rule status value.
 
+$Numeric $ExFm* {100};              # This group of rules also attach trailing 
format/extends, but
+$ALetterPlus $ExFm* {200};          # with rule status set based on the word's 
final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {400};             # note:  these status values override 
those from rule 5
+$Hiragana $ExFm* {400};             #        by virtue of being numerically 
larger.
+$Ideographic $ExFm* {400};          #
 
 #
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
+# rule 5
+#    Do not break between most letters.
 #
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
 
+# rule 6 and 7
+($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | 
$Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
 
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
 
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $PrefixLetter? $ALetterEx ($FormatEx* $MidLetterEx? 
$FormatEx* $ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* $SufixLetterEx? {200};
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
 
-[[:P:][:S:]]*;
+# rule 8
 
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+$Numeric $ExFm* $Numeric;
 
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
+# rule 9
 
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
 
-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
-#
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm*  $Katakana {400};
+
+# rule 13a/b
+
+$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
+$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
+$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
+$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
+
+$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
+$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
 
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With incoming rule chaining disabled by ^, this rule will match exactly 
two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend 
the match.
 #
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $SufixLetter $Extend $Format];
+^$Regional_Indicator $ExFm* $Regional_Indicator;
 
-#!.*;
-! ($NonStarters* | 
  ) .;
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji 
found
 
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;
diff --git a/i18npool/source/breakiterator/data/dict_word_nodash.txt 
b/i18npool/source/breakiterator/data/dict_word_nodash.txt
deleted file mode 100644
index 279cc50e5b66..000000000000
--- a/i18npool/source/breakiterator/data/dict_word_nodash.txt
+++ /dev/null
@@ -1,147 +0,0 @@
-#
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
-#
-#   file:  dict_word.txt   
-#
-#   ICU Word Break Rules
-#      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
-#
-
-
-
-####################################################################################
-#
-#  Character class definitions from TR 29
-#
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW 
PUNCTUATION GERESH:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = 
SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL 
LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:] [:name = PRIME:] ];  
-
-$SufixLetter = [:name= FULL STOP:];
-              
-
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 
[:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE 
QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
-
-#
-#  Character Class Definitions.
-#    The names are those from TR29.
-#
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-
-
-
-
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
-
-$Format    = [[:Cf:] - $TheZWSP];
-
-
-
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
-
-
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
-
-
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
-
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* $SufixLetterEx? {200};
-
-[[:P:][:S:]]*;
-
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
-
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
-
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
-
-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
-#
-
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
-#
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $SufixLetter $Extend $Format];
-
-#!.*;
-! ($NonStarters* | 
  ) .;
-
diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt 
b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
index fb29b478af21..b39503d1b405 100644
--- a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
+++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
@@ -1,157 +1,221 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
 #
-#   file:  dict_word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
+##############################################################################
 
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed 
alongside.
+### - By doing this, maintainers can more easily compare to an upstream 
baseline.
+###
+### END CUSTOMIZATION
 
-# list of dashes or hyphens that should be accepted as part of the word if a 
single one of these
-# pre- or postfixes a word. E.g. in German: "Arbeits-" or "-nehmer" where that 
hyphen needs to
-# be part of the word in order to have it properly spell checked etc.
-$PrePostDashHyphen = [ [:name = HYPHEN-MINUS:] ];
+!!chain;
+!!quoted_literals_only;
 
 
-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW 
PUNCTUATION GERESH:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = 
SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL 
LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:] [:name = PRIME:] 
-              [:name = HYPHEN-MINUS:] ];
+#
+#  Character Class Definitions.
+#
 
-$SufixLetter = [:name= FULL STOP:];
-              
+$Han                = [:Han:];
 
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 
[:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE 
QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}-$Han];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$ALetter            = [\p{Word_Break = ALetter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [\p{Extended_Pictographic}];
 
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Dictionary words can contain hyphens
+### tdf#49885: Sync custom BreakIterator rules with ICU originals
+### - ICU is now more permissive about punctuation inside words.
+### - For compatibility, exclude certain characters that were previously 
excluded.
 
-$TheZWSP = \u200b;
+$IncludedML         = [:name = HYPHEN-MINUS:];
+$ExcludedML         = [[:name = COLON:]
+                       [:name = GREEK ANO TELEIA:]
+                       [:name = PRESENTATION FORM FOR VERTICAL COLON:]
+                       [:name = SMALL COLON:]
+                       [:name = FULLWIDTH COLON:]];
 
-#
-#  Character Class Definitions.
-#    The names are those from TR29.
-#
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+# $MidLetter          = [\p{Word_Break = MidLetter}];
+$MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
 
+### END CUSTOMIZATION
 
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Allow leading and trailing hyphens in certain 
languages
+### This part of the customization does not replace any rules.
 
+$PrePostHyphen      = [:name = HYPHEN-MINUS:];
 
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
+### END CUSTOMIZATION
 
-$Format    = [[:Cf:] - $TheZWSP];
+$Hiragana           = [:Hiragana:];
+$Ideographic        = [\p{Ideographic}];
 
 
+#   Dictionary character set, for triggering language-based break engines. 
Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in 
Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include 
all
+#   characters requiring dictionary break.
 
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
 
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
 
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 
 
+## -------------------------------------------------
+
+# Rule 3 - CR x LF
 #
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+$CR $LF;
 
+# Rule 3c   Do not break within emoji zwj sequences.
+#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no 
intervening Extend chars allowed.
 #
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
+$ZWJ $Extended_Pict;
+
+# Rule 3d - Keep horizontal whitespace together.
 #
-# At most one leading or trailing dash/hyphen should be accepted as well.
-# E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to
-# be part of the word in order to have it properly spell checked etc.
-$LetterSequence = $PrePostDashHyphen? $ALetterEx ($FormatEx* $MidLetterEx? 
$FormatEx* $ALetterEx)* $PrePostDashHyphen?;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* $SufixLetterEx? {200};
+$WSegSpace $WSegSpace;
 
-[[:P:][:S:]]*;
+# Rule 4 - ignore Format and Extend characters, except when they appear at the 
beginning
+#          of a region of Text.
 
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+$ExFm  = [$Extend $Format $ZWJ];
 
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
+^$ExFm+;            # This rule fires only when there are format or extend 
characters at the
+                    # start of text, or immediately following another 
boundary. It groups them, in
+                    # the event there are more than one.
 
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing 
format/extends to words,
+                                    # with no special rule status value.
+
+$Numeric $ExFm* {100};              # This group of rules also attach trailing 
format/extends, but
+$ALetterPlus $ExFm* {200};          # with rule status set based on the word's 
final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {400};             # note:  these status values override 
those from rule 5
+$Hiragana $ExFm* {400};             #        by virtue of being numerically 
larger.
+$Ideographic $ExFm* {400};          #
 
 #
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
+# rule 5
+#    Do not break between most letters.
 #
 
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Allow leading and trailing hyphens in certain 
languages
+
+# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | 
$Hebrew_Letter) ($PrePostHyphen)?;
+
+### END CUSTOMIZATION
+
+# rule 6 and 7
+
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Allow leading and trailing hyphens in certain 
languages
+
+# ($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | 
$Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | 
$MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) 
($PrePostHyphen)? {200};
+
+### END CUSTOMIZATION
+
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
+
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
+
+# rule 8
+
+$Numeric $ExFm* $Numeric;
+
+# rule 9
+
+($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
+
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm*  $Katakana {400};
+
+# rule 13a/b
+
+$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
+$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
+$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
+$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
+
+$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
+$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
+
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With incoming rule chaining disabled by ^, this rule will match exactly 
two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend 
the match.
 #
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $SufixLetter $Extend $Format];
+^$Regional_Indicator $ExFm* $Regional_Indicator;
 
-#!.*;
-! ($NonStarters* | 
  ) .;
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji 
found
 
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;
diff --git a/i18npool/source/breakiterator/data/edit_word.txt 
b/i18npool/source/breakiterator/data/edit_word.txt
index 92b344c19d41..14fc221aa96e 100644
--- a/i18npool/source/breakiterator/data/edit_word.txt
+++ b/i18npool/source/breakiterator/data/edit_word.txt
@@ -1,142 +1,199 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
 #
-#   file:  edit_word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW 
PUNCTUATION GERESH:] 
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW 
PUNCTUATION GERSHAYIM:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:]];  
-              
-$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
+##############################################################################
+
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed alongside.
+### - By doing this, maintainers can more easily compare to an upstream baseline.
+###
+### END CUSTOMIZATION
+
+!!chain;
+!!quoted_literals_only;
+
 
 #
 #  Character Class Definitions.
-#    The names are those from TR29.
 #
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
 
+$Han                = [:Han:];
 
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}-$Han];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$ALetter            = [\p{Word_Break = ALetter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidLetter          = [\p{Word_Break = MidLetter}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [\p{Extended_Pictographic}];
 
+### BEGIN CUSTOMIZATION
+### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
+### This change subtracts undesired characters from the above families
 
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
+# $MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]];
 
-$Format    = [[:Cf:] - $TheZWSP];
+# $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]];
 
+### END CUSTOMIZATION
 
+$Hiragana           = [:Hiragana:];
+$Ideographic        = [\p{Ideographic}];
 
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
 
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
 
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
 
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
 
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* {200};
 
-# Punctuations by themselves
-[[:P:][:S:]-[:name = FULL STOP:]]*;
-[[:name = FULL STOP:]]*;
+## -------------------------------------------------
 
+# Rule 3 - CR x LF
 #
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+$CR $LF;
 
+# Rule 3c   Do not break within emoji zwj sequences.
+#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no intervening Extend chars allowed.
 #
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
+$ZWJ $Extended_Pict;
 
+# Rule 3d - Keep horizontal whitespace together.
 #
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+$WSegSpace $WSegSpace;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
+#          of a region of Text.
+
+$ExFm  = [$Extend $Format $ZWJ];
+
+^$ExFm+;            # This rule fires only when there are format or extend characters at the
+                    # start of text, or immediately following another boundary. It groups them, in
+                    # the event there are more than one.
+
+[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing format/extends to words,
+                                    # with no special rule status value.
+
+$Numeric $ExFm* {100};              # This group of rules also attach trailing format/extends, but
+$ALetterPlus $ExFm* {200};          # with rule status set based on the word's final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {400};             # note:  these status values override those from rule 5
+$Hiragana $ExFm* {400};             #        by virtue of being numerically larger.
+$Ideographic $ExFm* {400};          #
 
 #
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
+# rule 5
+#    Do not break between most letters.
 #
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 6 and 7
+($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
+
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
+
+# rule 8
+
+$Numeric $ExFm* $Numeric;
+
+# rule 9
+
+($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
 
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm*  $Katakana {400};
+
+# rule 13a/b
+
+$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
+$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
+$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
+$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
+
+$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
+$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
+
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With incoming rule chaining disabled by ^, this rule will match exactly two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
 #
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $Extend $Format];
+^$Regional_Indicator $ExFm* $Regional_Indicator;
 
-#!.*;
-! ($NonStarters* | 
  ) .;
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
+
+### BEGIN CUSTOMIZATION
+### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
+### This customization does not replace any rules.
+[[:P:][:S:]-[:name = FULL STOP:]]*
+[[:name = FULL STOP:]]*;
+### END CUSTOMIZATION
 
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;
diff --git a/i18npool/source/breakiterator/data/edit_word_he.txt 
b/i18npool/source/breakiterator/data/edit_word_he.txt
deleted file mode 100644
index 0b5908814e08..000000000000
--- a/i18npool/source/breakiterator/data/edit_word_he.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-#
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
-#
-#   file:  edit_word.txt   
-#
-#   ICU Word Break Rules
-#      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
-#
-
-
-
-####################################################################################
-#
-#  Character class definitions from TR 29
-#
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW 
PUNCTUATION GERESH:] 
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = MIDDLE 
DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:]];  
-              
-$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
-
-#
-#  Character Class Definitions.
-#    The names are those from TR29.
-#
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-
-
-
-
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
-
-$Format    = [[:Cf:] - $TheZWSP];
-
-
-
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
-
-
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
-
-
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
-
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* {200};
-
-# Punctuations by themselves
-[[:P:][:S:]-[:name = FULL STOP:]]*;
-[[:name = FULL STOP:]]*;
-
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
-
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
-
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
-
-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
-#
-
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
-#
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $Extend $Format];
-
-#!.*;
-! ($NonStarters* | 
  ) .;
-
diff --git a/i18npool/source/breakiterator/data/edit_word_hu.txt 
b/i18npool/source/breakiterator/data/edit_word_hu.txt
index 4a08acab0029..389ad2bacc13 100644
--- a/i18npool/source/breakiterator/data/edit_word_hu.txt
+++ b/i18npool/source/breakiterator/data/edit_word_hu.txt
@@ -1,159 +1,215 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
 #
-#   file:  edit_word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW 
PUNCTUATION GERESH:] 
-                [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER 
TEN THOUSAND SIGN:]
-                [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO 
SIGN:]
-                [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:]
-                [:name = DIGIT ZERO:]
-                [:name = DIGIT ONE:]
-                [:name = DIGIT TWO:]
-                [:name = DIGIT THREE:]
-                [:name = DIGIT FOUR:]
-                [:name = DIGIT FIVE:]
-                [:name = DIGIT SIX:]
-                [:name = DIGIT SEVEN:]
-                [:name = DIGIT EIGHT:]
-                [:name = DIGIT NINE:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW 
PUNCTUATION GERSHAYIM:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:]  
-              [:name = HYPHEN-MINUS:] [:name = EURO SIGN:] [:name = PERCENT 
SIGN:] 
-              [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
-              [:name = EN DASH:] [:name = EM DASH:]
-              [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE 
SIGN:]];
-              
-$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
+##############################################################################
+
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed alongside.
+### - By doing this, maintainers can more easily compare to an upstream baseline.
+###
+### END CUSTOMIZATION
+
+!!chain;
+!!quoted_literals_only;
+
 
 #
 #  Character Class Definitions.
-#    The names are those from TR29.
 #
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
 
+$Han                = [:Han:];
 
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}-$Han];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [\p{Extended_Pictographic}];
 
+### BEGIN CUSTOMIZATION
+### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
+### This change subtracts undesired characters from the above families
+### i#56347: BreakIterator patch for Hungarian
+### i#56348: Special chars in first pos not handled by spell checking for Hungarian
 
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
+$Symbols_hu         = [[:name = PERCENT SIGN:]
+                       [:name = PER MILLE SIGN:]
+                       [:name = PER TEN THOUSAND SIGN:]
+                       [:name = SECTION SIGN:]
+                       [:name = DEGREE SIGN:]
+                       [:name = EURO SIGN:]
+                       [:name = HYPHEN-MINUS:]
+                       [:name = EN DASH:]
+                       [:name = EM DASH:]];
 
-$Format    = [[:Cf:] - $TheZWSP];
+# $ALetter            = [\p{Word_Break = ALetter}];
+$ALetter            = [\p{Word_Break = ALetter} $Symbols_hu];
 
+# $MidLetter          = [\p{Word_Break = MidLetter}];
+$MidLetter          = [\p{Word_Break = MidLetter} $Symbols_hu];
 
+# $MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]];
 
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
+# $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]];
 
+### END CUSTOMIZATION
 
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
+$Hiragana           = [:Hiragana:];
+$Ideographic        = [\p{Ideographic}];
 
 
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
 
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* {200};
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
 
-# Punctuations by themselves
-[[:P:][:S:]-[:name = FULL STOP:]]*;
-[[:name = FULL STOP:]]*;
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
 
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 
+
+## -------------------------------------------------
+
+# Rule 3 - CR x LF
 #
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
+$CR $LF;
 
+# Rule 3c   Do not break within emoji zwj sequences.
+#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no intervening Extend chars allowed.
 #
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
+$ZWJ $Extended_Pict;
+
+# Rule 3d - Keep horizontal whitespace together.
 #
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+$WSegSpace $WSegSpace;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
+#          of a region of Text.
+
+$ExFm  = [$Extend $Format $ZWJ];
+
+^$ExFm+;            # This rule fires only when there are format or extend characters at the
+                    # start of text, or immediately following another boundary. It groups them, in
+                    # the event there are more than one.
+
+[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing format/extends to words,
+                                    # with no special rule status value.
+
+$Numeric $ExFm* {100};              # This group of rules also attach trailing format/extends, but
+$ALetterPlus $ExFm* {200};          # with rule status set based on the word's final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {400};             # note:  these status values override those from rule 5
+$Hiragana $ExFm* {400};             #        by virtue of being numerically larger.
+$Ideographic $ExFm* {400};          #
 
 #
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
+# rule 5
+#    Do not break between most letters.
 #
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 6 and 7
+($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
+
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
+
+# rule 8
+
+$Numeric $ExFm* $Numeric;
+
+# rule 9
 
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
+($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
+
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm*  $Katakana {400};
+
+# rule 13a/b
+
+$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
+$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
+$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
+$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
+
+$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
+$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
+
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With incoming rule chaining disabled by ^, this rule will match exactly two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
 #
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $Extend $Format];
+^$Regional_Indicator $ExFm* $Regional_Indicator;
 
-#!.*;
-! ($NonStarters* | 
  ) .;
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
+
+### BEGIN CUSTOMIZATION
+### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
+### This customization does not replace any rules.
+[[:P:][:S:]-[:name = FULL STOP:]]*
+[[:name = FULL STOP:]]*;
+### END CUSTOMIZATION
 
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;
diff --git a/i18npool/source/breakiterator/data/line.txt 
b/i18npool/source/breakiterator/data/line.txt
index ff3f3eafc42e..46a618c63cae 100644
--- a/i18npool/source/breakiterator/data/line.txt
+++ b/i18npool/source/breakiterator/data/line.txt
@@ -1,176 +1,116 @@
-# Copyright (c) 2002-2006  International Business Machines Corporation and
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2002-2016  International Business Machines Corporation and
 # others. All Rights Reserved.
 #
 #  file:  line.txt
 #
 #         Line Breaking Rules
-#         Implement default line breaking as defined by Unicode Standard Annex #14 version 5.0.0
-#         http://www.unicode.org/reports/tr14/
-
-
+#         Implement default line breaking as defined by
+#         Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/)
+#         for Unicode 14.0, with the following modification:
+#
+#         Boundaries between hyphens and following letters are suppressed when
+#         there is a boundary preceding the hyphen. See rule 20.9
+#
+#         This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
+#         It sets characters of class CJ to behave like NS.
 
 #
 #  Character Classes defined by TR 14.
 #
 
-!!chain;
-!!LBCMNoChain;
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed alongside.
+### - By doing this, maintainers can more easily compare to an upstream baseline.
+###
+### END CUSTOMIZATION
 
-
-!!lookAheadHardBreak;
-#
-#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
-#                          and only used for the line break rules.
-#
-#           It is used in the implementation of the incredibly annoying rule LB 10
-#           which says to treat any combining mark that is not attached to a base
-#           character as if it were of class AL  (alphabetic).
-#
-#           The problem occurs in the reverse rules.
-#
-#           Consider a sequence like, with correct breaks as shown
-#               LF  ID  CM  AL  AL
-#                  ^       ^       ^
-#           Then consider the sequence without the initial ID (ideographic)
-#                 LF  CM  AL  AL
-#                    ^           ^
-#           Our CM, which in the first example was attached to the ideograph,
-#           is now unattached, becomes an alpha, and joins in with the other
-#           alphas.
-#
-#           When iterating forwards, these sequences do not present any problems
-#           When iterating backwards, we need to look ahead when encountering
-#           a CM to see whether it attaches to something further on or not.
-#           (Look-ahead in a reverse rule is looking towards the start)
-#
-#           If the CM is unattached, we need to force a break.
-#
-#           !!lookAheadHardBreak forces the run time state machine to
-#           stop immediately when a look ahead rule ( '/' operator) matches,
-#           and set the match position to that of the look-ahead operator,
-#           no matter what other rules may be in play at the time.
-#
-#           See rule LB 19 for an example.
-#
+!!chain;
+!!quoted_literals_only;
 
 $AI = [:LineBreak =  Ambiguous:];
-$DG = \u00B0;
-$AL = [[:LineBreak =  Alphabetic:] $DG];
+$AL = [:LineBreak =  Alphabetic:];
 $BA = [:LineBreak =  Break_After:];
+$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
 $BB = [:LineBreak =  Break_Before:];
 $BK = [:LineBreak =  Mandatory_Break:];
 $B2 = [:LineBreak =  Break_Both:];
 $CB = [:LineBreak =  Contingent_Break:];
 $CJ = [:LineBreak =  Conditional_Japanese_Starter:];
-$CL = [[:LineBreak =  Close_Punctuation:] [:LineBreak = Close_Parenthesis:]]; # tdf#31271
-$CM = [:LineBreak =  Combining_Mark:];
+$CL = [:LineBreak =  Close_Punctuation:];
+# $CM = [:LineBreak =  Combining_Mark:];
+$CP = [:LineBreak =  Close_Parenthesis:];
 $CR = [:LineBreak =  Carriage_Return:];
+$EB = [:LineBreak =  EB:];
+$EM = [:LineBreak =  EM:];
 $EX = [:LineBreak =  Exclamation:];
 $GL = [:LineBreak =  Glue:];
 $HL = [:LineBreak =  Hebrew_Letter:];
 $HY = [:LineBreak =  Hyphen:];
 $H2 = [:LineBreak =  H2:];
 $H3 = [:LineBreak =  H3:];
-$ID = [[:LineBreak =  Ideographic:] - [\ufe30]];
-$IN = [:LineBreak =  Inseparable:];
-$IS = [[:LineBreak =  Infix_Numeric:] [\ufe30]];
+$ID = [:LineBreak =  Ideographic:];
+$IN = [:LineBreak =  Inseperable:];
+$IS = [:LineBreak =  Infix_Numeric:];
 $JL = [:LineBreak =  JL:];
 $JV = [:LineBreak =  JV:];
 $JT = [:LineBreak =  JT:];
 $LF = [:LineBreak =  Line_Feed:];
 $NL = [:LineBreak =  Next_Line:];
+# NS includes CJ for CSS strict line breaking.
 $NS = [[:LineBreak =  Nonstarter:] $CJ];
 $NU = [:LineBreak =  Numeric:];
-$OP = [[:LineBreak =  Open_Punctuation:] - $DG];
+$OP = [:LineBreak =  Open_Punctuation:];
 $PO = [:LineBreak =  Postfix_Numeric:];
-$BS = \u005C;
-$PR = [[:LineBreak =  Prefix_Numeric:] - $BS];
+$PR = [:LineBreak =  Prefix_Numeric:];
 $QU = [:LineBreak =  Quotation:];
+$RI = [:LineBreak =  Regional_Indicator:];
 $SA = [:LineBreak =  Complex_Context:];
 $SG = [:LineBreak =  Surrogate:];
 $SP = [:LineBreak =  Space:];
-$SY = [[:LineBreak =  Break_Symbols:] $BS];
+$SY = [:LineBreak =  Break_Symbols:];
 $WJ = [:LineBreak =  Word_Joiner:];
 $XX = [:LineBreak =  Unknown:];
 $ZW = [:LineBreak =  ZWSpace:];
+$ZWJ = [:LineBreak = ZWJ:];
+
+# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
+# without a formal name. Because ICU rules require multiple uses of the expressions,
+# give them a single definition with a name
+
+$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
+$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
+
+$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
+
+# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
+#         list it in the numerous rules that use CM.
+# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
+
+$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
+$CMX = [[$CM] - [$ZWJ]];
 
 #   Dictionary character set, for triggering language-based break engines. Currently
-#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
-#   5.0 or later as the definition of Complex_Context was corrected to include all
-#   characters requiring dictionary break.
+#   limited to LineBreak=Complex_Context (SA).
 
-$dictionary = [:LineBreak = Complex_Context:];
+$dictionary = [$SA];
 
 #
 #  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
-#                               SA  (South East Asian: Thai, Lao, Khmer)
+#                               SA  (Dictionary chars, excluding Mn and Mc)
 #                               SG  (Unpaired Surrogates)
 #                               XX  (Unknown, unassigned)
 #                         as $AL  (Alphabetic)
 #
-$ALPlus = [$AL $AI $SA $SG $XX];
-
-#
-#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
-#
-$ALcm = $ALPlus $CM*;
-$BAcm = $BA $CM*;
-$BBcm = $BB $CM*;
-$B2cm = $B2 $CM*;
-$CLcm = $CL $CM*;
-$EXcm = $EX $CM*;
-$GLcm = $GL $CM*;
-$HLcm = $HL $CM*;
-$HYcm = $HY $CM*;
-$H2cm = $H2 $CM*;
-$H3cm = $H3 $CM*;
-$IDcm = $ID $CM*;
-$INcm = $IN $CM*;
-$IScm = $IS $CM*;
-$JLcm = $JL $CM*;
-$JVcm = $JV $CM*;
-$JTcm = $JT $CM*;
-$NScm = $NS $CM*;
-$NUcm = $NU $CM*;
-$OPcm = $OP $CM*;
-$POcm = $PO $CM*;
-$PRcm = $PR $CM*;
-$QUcm = $QU $CM*;
-$SYcm = $SY $CM*;
-$WJcm = $WJ $CM*;
+$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
 
-## -------------------------------------------------
 
-!!forward;
-
-#
-#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
-#
-$ALPlus $CM+;
-$BA $CM+;
-$BB $CM+;
-$B2 $CM+;
-$CL $CM+;
-$EX $CM+;
-$GL $CM+;
-$HL $CM+;
-$HY $CM+;
-$H2 $CM+;
-$H3 $CM+;
-$ID $CM+;
-$IN $CM+;
-$IS $CM+;
-$JL $CM+;
-$JV $CM+;
-$JT $CM+;
-$NS $CM+;
-$NU $CM+;
-$OP $CM+;
-$PO $CM+;
-$PR $CM+;
-$QU $CM+;
-$SY $CM+;
-$WJ $CM+;
+## -------------------------------------------------
 
 #
 # CAN_CM  is the set of characters that may combine with CM combining chars.
@@ -186,19 +126,15 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
 #
 # AL_FOLLOW  set of chars that can unconditionally follow an AL
 #            Needed in rules where stand-alone $CM s are treated as AL.
-#            Chaining is disabled with CM because it causes other failures,
-#            so for this one case we need to manually list out longer sequences.
 #
-$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
-$AL_FOLLOW_CM   = [$CL $EX $HL $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP];
-$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+$AL_FOLLOW      = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
 
 
 #
 #  Rule LB 4, 5    Mandatory (Hard) breaks.
 #
 $LB4Breaks    = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
 $CR $LF {100};
 
 #
@@ -206,91 +142,124 @@ $CR $LF {100};
 #
 $LB4NonBreaks?  $LB4Breaks {100};    # LB 5  do not break before hard breaks.
 $CAN_CM $CM*    $LB4Breaks {100};
-$CM+            $LB4Breaks {100};
+^$CM+           $LB4Breaks {100};
 
 # LB 7         x SP
 #              x ZW
 $LB4NonBreaks [$SP $ZW];
 $CAN_CM $CM*  [$SP $ZW];
-$CM+          [$SP $ZW];
+^$CM+         [$SP $ZW];
 
 #
 # LB 8         Break after zero width space
+#              ZW SP* ÷
 #
 $LB8Breaks    = [$LB4Breaks $ZW];
 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
 
+# LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
+#
+$ZWJ [^$CM];
 
-# LB 9     Combining marks.      X   $CM needs to behave like X, where X is 
not $SP, $BK $CR $LF $NL 
-#                                $CM not covered by the above needs to behave 
like $AL   
+# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-e 
... etc. - the rest is truncated

core.git: Branch 'distro/allotropia/zeta-24-2' - i18npool/CustomTarget_breakiterator.mk i18npool/qa i18npool/source

Reply via email to