* tests/uniwbrk/test-uc-wordbreaks.c (wordbreakproperty_to_string): Support WBP_DQ, WBP_SQ, and WBP_HL.
* lib/uniwbrk.in.h (WBP_DQ, WBP_SQ, WBP_HL): New enumeration values. * lib/uniwbrk/u-wordbreaks.h (FUNC): Support WB7a, WB7b, and WB7c. Update WB5, WB6, WB7, WB9, WB11, WB12, WB13a, and WB13b. * lib/uniwbrk/wbrktable.h (uniwbrk_table): Adjust table size. * lib/uniwbrk/wbrktable.c (uniwbrk_table): Support rule WB7a. Update WB5, WB9, WB10, WB13a, and WB13b. * lib/gen-uni-tables.c (UC_BIDI_LRI, UC_BIDI_RLI, UC_BIDI_FSI) (UC_BIDI_PDI): New enumeration values. (bidi_category_byname): Support those enum values. (is_WBP_MIDNUMLET): Exclude 0x0027 (SINGLE QUOTE), which is now assigned a dedicated property. (is_property_case_ignorable): Check 0x0027. (WBP_DQ, WBP_SQ, WBP_HL): New enumeration values. (get_wbp, debug_output_wbp, fill_org_wbp, debug_output_org_wbp) (output_wbp): Support those enum values. * lib/unictype.in.h (UC_BIDI_LRI, UC_BIDI_RLI, UC_BIDI_FSI) (UC_BIDI_PDI): New enumeration values. * lib/unictype/bidi_byname.gperf: Add those property names. --- lib/gen-uni-tables.c | 76 ++++++++++++++++++++++++++++++++++---- lib/unictype.in.h | 6 ++- lib/unictype/bidi_byname.gperf | 12 ++++++ lib/uniwbrk.in.h | 5 ++- lib/uniwbrk/u-wordbreaks.h | 38 ++++++++++++------- lib/uniwbrk/wbrktable.c | 52 ++++++++++++++------------ lib/uniwbrk/wbrktable.h | 2 +- tests/uniwbrk/test-uc-wordbreaks.c | 3 ++ 8 files changed, 145 insertions(+), 49 deletions(-) diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index f833777..af541d1 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -32,7 +32,7 @@ /usr/local/share/Unidata/CompositionExclusions.txt \ /usr/local/share/Unidata/SpecialCasing.txt \ /usr/local/share/Unidata/CaseFolding.txt \ - 6.2.0 + 6.3.0 */ #include <stdbool.h> @@ -1307,7 +1307,11 @@ enum UC_BIDI_B, /* Paragraph Separator */ UC_BIDI_S, /* Segment Separator */ UC_BIDI_WS, /* Whitespace */ - UC_BIDI_ON /* Other Neutral */ + UC_BIDI_ON, /* Other Neutral */ + UC_BIDI_LRI, /* Left-to-Right Isolate */ + UC_BIDI_RLI, /* Right-to-Left Isolate */ + UC_BIDI_FSI, /* 
First Strong Isolate */ + UC_BIDI_PDI /* Pop Directional Isolate */ }; static int @@ -1365,7 +1369,20 @@ bidi_category_byname (const char *category_name) break; } break; - case 'L': + case 'F': + switch (category_name[1]) + { + case 'S': + switch (category_name[2]) + { + case 'I': + if (category_name[3] == '\0') + return UC_BIDI_FSI; + break; + } + } + break; + case 'L': switch (category_name[1]) { case '\0': @@ -1381,7 +1398,11 @@ bidi_category_byname (const char *category_name) if (category_name[3] == '\0') return UC_BIDI_LRO; break; - } + case 'I': + if (category_name[3] == '\0') + return UC_BIDI_LRI; + break; + } break; } break; @@ -1418,6 +1439,10 @@ bidi_category_byname (const char *category_name) if (category_name[3] == '\0') return UC_BIDI_PDF; break; + case 'I': + if (category_name[3] == '\0') + return UC_BIDI_PDI; + break; } break; } @@ -1438,7 +1463,11 @@ bidi_category_byname (const char *category_name) if (category_name[3] == '\0') return UC_BIDI_RLO; break; - } + case 'I': + if (category_name[3] == '\0') + return UC_BIDI_RLI; + break; + } break; } break; @@ -2518,7 +2547,7 @@ output_mirror (const char *filename, const char *version) static bool is_WBP_MIDNUMLET (unsigned int ch) { - return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 + return (ch == 0x002E || ch == 0x2018 || ch == 0x2019 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E); } @@ -2999,6 +3028,7 @@ static bool is_property_case_ignorable (unsigned int ch) { bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch) + || ch == 0x0027 || is_category_Mn (ch) || is_category_Me (ch) || is_category_Cf (ch) @@ -7467,7 +7497,10 @@ enum WBP_MIDNUM = 5, WBP_NUMERIC = 6, WBP_EXTENDNUMLET = 7, - WBP_RI = 13 + WBP_RI = 13, + WBP_DQ = 14, + WBP_SQ = 15, + WBP_HL = 16 }; /* Returns the word breaking property for ch, as a bit mask. 
*/ @@ -7506,6 +7539,11 @@ get_wbp (unsigned int ch) || ch == 0xFF70) attr |= 1 << WBP_KATAKANA; + if ((unicode_scripts[ch] < numscripts + && strcmp (scripts[unicode_scripts[ch]], "Hebrew") == 0) + && strcmp (unicode_attributes[ch].category, "Lo") == 0) + attr |= 1 << WBP_HL; + if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0 || ch == 0x05F3) && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0 @@ -7513,7 +7551,8 @@ get_wbp (unsigned int ch) && ((get_lbp (ch) >> LBP_SA) & 1) == 0 && !(unicode_scripts[ch] < numscripts && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0) - && (attr & (1 << WBP_EXTEND)) == 0) + && (attr & (1 << WBP_EXTEND)) == 0 + && (attr & (1 << WBP_HL)) == 0) attr |= 1 << WBP_ALETTER; if (is_WBP_MIDNUMLET (ch)) @@ -7538,6 +7577,12 @@ get_wbp (unsigned int ch) if (((get_lbp (ch) >> LBP_RI) & 1) != 0) attr |= 1 << WBP_RI; + + if (ch == 0x0022) + attr |= 1 << WBP_DQ; + + if (ch == 0x0027) + attr |= 1 << WBP_SQ; } if (attr == 0) @@ -7585,6 +7630,12 @@ debug_output_wbp (FILE *stream) fprintf (stream, " ExtendNumLet"); if (attr & (1 << WBP_RI)) fprintf (stream, " Regional_Indicator"); + if (attr & (1 << WBP_DQ)) + fprintf (stream, " Double_Quote"); + if (attr & (1 << WBP_SQ)) + fprintf (stream, " Single_Quote"); + if (attr & (1 << WBP_HL)) + fprintf (stream, " Hebrew_Letter"); fprintf (stream, "\n"); } } @@ -7671,6 +7722,9 @@ fill_org_wbp (const char *wordbreakproperty_filename) PROP ("Numeric", WBP_NUMERIC) PROP ("ExtendNumLet", WBP_EXTENDNUMLET) PROP ("Regional_Indicator", WBP_RI) + PROP ("Double_Quote", WBP_DQ) + PROP ("Single_Quote", WBP_SQ) + PROP ("Hebrew_Letter", WBP_HL) #undef PROP { fprintf (stderr, "unknown property value '%s' in '%s'\n", propname, @@ -7718,6 +7772,9 @@ debug_output_org_wbp (FILE *stream) PROP ("Numeric", WBP_NUMERIC) PROP ("ExtendNumLet", WBP_EXTENDNUMLET) PROP ("Regional_Indicator", WBP_RI) + PROP ("Double_Quote", WBP_DQ) + PROP ("Single_Quote", WBP_SQ) + PROP ("Hebrew_Letter", WBP_HL) #undef PROP 
fprintf (stream, " ??"); fprintf (stream, "\n"); @@ -7871,6 +7928,9 @@ output_wbp (FILE *stream) CASE(WBP_NUMERIC); CASE(WBP_EXTENDNUMLET); CASE(WBP_RI); + CASE(WBP_DQ); + CASE(WBP_SQ); + CASE(WBP_HL); #undef CASE default: abort (); diff --git a/lib/unictype.in.h b/lib/unictype.in.h index 30c71aa..c31d9e5 100644 --- a/lib/unictype.in.h +++ b/lib/unictype.in.h @@ -312,7 +312,11 @@ enum UC_BIDI_B, /* Paragraph Separator */ UC_BIDI_S, /* Segment Separator */ UC_BIDI_WS, /* Whitespace */ - UC_BIDI_ON /* Other Neutral */ + UC_BIDI_ON, /* Other Neutral */ + UC_BIDI_LRI, /* Left-to-Right Isolate */ + UC_BIDI_RLI, /* Right-to-Left Isolate */ + UC_BIDI_FSI, /* First Strong Isolate */ + UC_BIDI_PDI /* Pop Directional Isolate */ }; /* Return the name of a bidi class. */ diff --git a/lib/unictype/bidi_byname.gperf b/lib/unictype/bidi_byname.gperf index 9cacacf..5bb0faa 100644 --- a/lib/unictype/bidi_byname.gperf +++ b/lib/unictype/bidi_byname.gperf @@ -19,14 +19,18 @@ CS, UC_BIDI_CS EN, UC_BIDI_EN ES, UC_BIDI_ES ET, UC_BIDI_ET +FSI, UC_BIDI_FSI L, UC_BIDI_L LRE, UC_BIDI_LRE +LRI, UC_BIDI_LRI LRO, UC_BIDI_LRO NSM, UC_BIDI_NSM ON, UC_BIDI_ON PDF, UC_BIDI_PDF +PDI, UC_BIDI_PDI R, UC_BIDI_R RLE, UC_BIDI_RLE +RLI, UC_BIDI_RLI RLO, UC_BIDI_RLO S, UC_BIDI_S WS, UC_BIDI_WS @@ -46,10 +50,14 @@ European Separator, UC_BIDI_ES EuropeanSeparator, UC_BIDI_ES European Terminator, UC_BIDI_ET EuropeanTerminator, UC_BIDI_ET +First Strong Isolate, UC_BIDI_FSI +FirstStrongIsolate, UC_BIDI_FSI Left To Right, UC_BIDI_L LeftToRight, UC_BIDI_L Left To Right Embedding, UC_BIDI_LRE LeftToRightEmbedding, UC_BIDI_LRE +Left To Right Isolate, UC_BIDI_LRI +LeftToRightIsolate, UC_BIDI_LRI Left To Right Override, UC_BIDI_LRO LeftToRightOverride, UC_BIDI_LRO Nonspacing Mark, UC_BIDI_NSM @@ -58,10 +66,14 @@ Other Neutral, UC_BIDI_ON OtherNeutral, UC_BIDI_ON Pop Directional Format, UC_BIDI_PDF PopDirectionalFormat, UC_BIDI_PDF +Pop Directional Isolate, UC_BIDI_PDI +PopDirectionalIsolate, UC_BIDI_PDI Right To 
Left, UC_BIDI_R RightToLeft, UC_BIDI_R Right To Left Embedding, UC_BIDI_RLE RightToLeftEmbedding, UC_BIDI_RLE +Right To Left Isolate, UC_BIDI_RLI +RightToLeftIsolate, UC_BIDI_RLI Right To Left Override, UC_BIDI_RLO RightToLeftOverride, UC_BIDI_RLO Segment Separator, UC_BIDI_S diff --git a/lib/uniwbrk.in.h b/lib/uniwbrk.in.h index c272d48..9abea42 100644 --- a/lib/uniwbrk.in.h +++ b/lib/uniwbrk.in.h @@ -50,7 +50,10 @@ enum WBP_MIDNUM = 5, WBP_NUMERIC = 6, WBP_EXTENDNUMLET = 7, - WBP_RI = 13 + WBP_RI = 13, + WBP_DQ = 14, + WBP_SQ = 15, + WBP_HL = 16 }; /* Return the Word_Break property of a Unicode character. */ diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h index 04d2738..b043d68 100644 --- a/lib/uniwbrk/u-wordbreaks.h +++ b/lib/uniwbrk/u-wordbreaks.h @@ -69,31 +69,41 @@ FUNC (const UNIT *s, size_t n, char *p) secondlast last current - ALetter (MidLetter | MidNumLet) × ALetter (WB7) - ALetter × (MidLetter | MidNumLet) ALetter (WB6) - Numeric (MidNum | MidNumLet) × Numeric (WB11) - Numeric × (MidNum | MidNumLet) Numeric (WB12) - ALetter × ALetter (WB5) - ALetter × Numeric (WB9) - Numeric × ALetter (WB10) + (ALetter | HL) (MidLetter | MidNumLet | SQ) × (ALetter | HL) (WB7) + (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6) + Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11) + Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12) + HL × DQ HL (WB7b) + HL DQ × HL (WB7c) + (ALetter | HL) × (ALetter | HL) (WB5) + (ALetter | HL) × Numeric (WB9) + Numeric × (ALetter | HL) (WB10) Numeric × Numeric (WB8) + HL × SQ (WB7a) Katakana × Katakana (WB13) - (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) + (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a) ExtendNumLet × ExtendNumLet (WB13a) - ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) + ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b) Regional_Indicator × Regional_Indicator (WB13c) */ /* No break across certain punctuation. 
Also, disable word breaks that were recognized earlier (due to lookahead of only one complex character). */ - if ((prop == WBP_ALETTER + if (((prop == WBP_ALETTER + || prop == WBP_HL) && (last_compchar_prop == WBP_MIDLETTER - || last_compchar_prop == WBP_MIDNUMLET) - && secondlast_compchar_prop == WBP_ALETTER) + || last_compchar_prop == WBP_MIDNUMLET + || last_compchar_prop == WBP_SQ) + && (secondlast_compchar_prop == WBP_ALETTER + || secondlast_compchar_prop == WBP_HL)) || (prop == WBP_NUMERIC && (last_compchar_prop == WBP_MIDNUM - || last_compchar_prop == WBP_MIDNUMLET) - && secondlast_compchar_prop == WBP_NUMERIC)) + || last_compchar_prop == WBP_MIDNUMLET + || last_compchar_prop == WBP_SQ) + && secondlast_compchar_prop == WBP_NUMERIC) + || (prop == WBP_HL + && last_compchar_prop == WBP_DQ + && secondlast_compchar_prop == WBP_HL)) { *last_compchar_ptr = 0; /* *p = 0; */ diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c index 04bd0e5..baeed58 100644 --- a/lib/uniwbrk/wbrktable.c +++ b/lib/uniwbrk/wbrktable.c @@ -22,31 +22,35 @@ /* This table contains the following rules (see UAX #29): - last current - - ALetter × ALetter (WB5) - ALetter × Numeric (WB9) - Numeric × ALetter (WB10) - Numeric × Numeric (WB8) - Katakana × Katakana (WB13) - (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) - ExtendNumLet × ExtendNumLet (WB13a) - ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) - Regional_Indicator × Regional_Indicator (WB13c) + last current + + (ALetter | HL) × (ALetter | HL) (WB5) + (ALetter | HL) × Numeric (WB9) + HL × SQ (WB7a) + Numeric × (ALetter | HL) (WB10) + Numeric × Numeric (WB8) + Katakana × Katakana (WB13) +(ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a) + ExtendNumLet × ExtendNumLet (WB13a) + ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b) + Regional_Indicator × Regional_Indicator (WB13c) */ -const unsigned char uniwbrk_table[9][9] = -{ /* current: OTHER MIDNUMLET NUMERIC */ - /* KATAKANA MIDLETTER EXTENDNUMLET */ 
- /* ALETTER MIDNUM RI */ +const unsigned char uniwbrk_table[12][12] = +{ /* current: OTHER MIDNUMLET NUMERIC DQ */ + /* KATAKANA MIDLETTER EXTENDNUMLET SQ */ + /* ALETTER MIDNUM RI HL */ /* last */ - /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0, 1 }, - /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0, 1 }, - /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0, 1 }, - /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0, 1 }, - /* WBP_RI */ { 1, 1, 1, 1, 1, 1, 1, 1, 0 } + /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }, + /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, + /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, + /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, + /* WBP_RI */ { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1 }, + /* WBP_DQ */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_SQ */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_HL */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0 } }; diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h index 50b7823..567a031 100644 --- a/lib/uniwbrk/wbrktable.h +++ b/lib/uniwbrk/wbrktable.h @@ -15,4 +15,4 @@ You should have received a copy of the GNU Lesser General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -extern const unsigned char uniwbrk_table[9][9]; +extern const unsigned char uniwbrk_table[12][12]; diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c index 41585f7..fc9dd59 100644 --- a/tests/uniwbrk/test-uc-wordbreaks.c +++ b/tests/uniwbrk/test-uc-wordbreaks.c @@ -48,6 +48,9 @@ wordbreakproperty_to_string (int wbp) CASE(NUMERIC) CASE(EXTENDNUMLET) CASE(RI) + CASE(DQ) + CASE(SQ) + CASE(HL) } abort (); } -- 1.9.3