* lib/unilbrk/lbrktables.h (LBP_RI): New enumeration value. (unilbrk_table): Adjust table size. * lib/unilbrk/lbrktables.c (unilbrk_table): Add a row and column for LBP_RI.
* lib/uniwbrk.in.h (WBP_RI): New enumeration value. * lib/uniwbrk/u-wordbreaks.h (FUNC): Support rule WB13c. Normalize table index skipping ignored properties. * lib/uniwbrk/wbrktable.c (uniwbrk_table): Support WBP_RI. Remove WBP_EXTEND and WBP_FORMAT, which are now computed without using the table. * lib/uniwbrk/wbrktable.h: Adjust table size. * tests/uniwbrk/test-uc-wordbreaks.c (wordbreakproperty_to_string): Support WBP_RI. * lib/unigbrk.in.h (GBP_RI): New enumeration value. * lib/unigbrk/uc-is-grapheme-break.c (UC_IS_GRAPHEME_BREAK): Support rule GB8a. (UC_GRAPHEME_BREAKS_FOR, gb_table): Support GBP_RI. * tests/unigbrk/test-uc-is-grapheme-break.c (graphemebreakproperty_to_string): Support GBP_RI. * tests/unigbrk/test-uc-gbrk-prop.c (graphemebreakproperty_to_string): Support GBP_RI. * lib/gen-uni-tables.c (LBP_RI): New enumeration value. (get_lbp, debug_output_lbp, fill_org_lbp, debug_output_org_lbp) (output_lbp): Support LBP_RI. (WBP_RI): New enumeration value. (get_wbp, debug_output_wbp, fill_org_wbp, debug_output_org_wbp) (output_wbp): Support WBP_RI. (GBP_RI): New enumeration value. (output_gbp_test, fill_org_gbp): Support GBP_RI. 
--- lib/gen-uni-tables.c | 49 ++++++++++++++++++-------- lib/unigbrk.in.h | 3 +- lib/unigbrk/uc-is-grapheme-break.c | 9 +++-- lib/unilbrk/lbrktables.c | 57 ++++++++++++++++--------------- lib/unilbrk/lbrktables.h | 21 ++++++------ lib/uniwbrk.in.h | 3 +- lib/uniwbrk/u-wordbreaks.h | 36 +++++++++++++------ lib/uniwbrk/wbrktable.c | 24 ++++++------- lib/uniwbrk/wbrktable.h | 2 +- tests/unigbrk/test-uc-gbrk-prop.c | 1 + tests/unigbrk/test-uc-is-grapheme-break.c | 1 + tests/uniwbrk/test-uc-wordbreaks.c | 1 + 12 files changed, 127 insertions(+), 80 deletions(-) diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index ec1aba5..f833777 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -32,7 +32,7 @@ /usr/local/share/Unidata/CompositionExclusions.txt \ /usr/local/share/Unidata/SpecialCasing.txt \ /usr/local/share/Unidata/CaseFolding.txt \ - 6.1.0 + 6.2.0 */ #include <stdbool.h> @@ -6213,22 +6213,22 @@ output_width_property_test (const char *filename) enum { - /* Values >= 26 are resolved at run time. */ - LBP_BK = 26, /* mandatory break */ + /* Values >= 27 are resolved at run time. 
*/ + LBP_BK = 27, /* mandatory break */ /*LBP_CR, carriage return - not used here because it's a DOSism */ /*LBP_LF, line feed - not used here because it's a DOSism */ - LBP_CM = 27, /* attached characters and combining marks */ + LBP_CM = 28, /* attached characters and combining marks */ /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */ /*LBP_SG, surrogates - not used here because they are not characters */ LBP_WJ = 0, /* word joiner */ - LBP_ZW = 28, /* zero width space */ + LBP_ZW = 29, /* zero width space */ LBP_GL = 1, /* non-breaking (glue) */ - LBP_SP = 29, /* space */ + LBP_SP = 30, /* space */ LBP_B2 = 2, /* break opportunity before and after */ LBP_BA = 3, /* break opportunity after */ LBP_BB = 4, /* break opportunity before */ LBP_HY = 5, /* hyphen */ - LBP_CB = 30, /* contingent break opportunity */ + LBP_CB = 31, /* contingent break opportunity */ LBP_CL = 6, /* closing punctuation */ LBP_CP = 7, /* closing parenthesis */ LBP_EX = 8, /* exclamation/interrogation */ @@ -6241,7 +6241,7 @@ enum LBP_PO = 15, /* postfix (numeric) */ LBP_PR = 16, /* prefix (numeric) */ LBP_SY = 17, /* symbols allowing breaks */ - LBP_AI = 31, /* ambiguous (alphabetic or ideograph) */ + LBP_AI = 32, /* ambiguous (alphabetic or ideograph) */ LBP_AL = 18, /* ordinary alphabetic and symbol characters */ /*LBP_CJ, conditional Japanese starter, resolved to NS */ LBP_H2 = 19, /* Hangul LV syllable */ @@ -6251,8 +6251,9 @@ enum LBP_JL = 22, /* Hangul L Jamo */ LBP_JV = 23, /* Hangul V Jamo */ LBP_JT = 24, /* Hangul T Jamo */ - LBP_SA = 32, /* complex context (South East Asian) */ - LBP_XX = 33 /* unknown */ + LBP_RI = 26, /* regional indicator */ + LBP_SA = 33, /* complex context (South East Asian) */ + LBP_XX = 34 /* unknown */ }; /* Returns the line breaking classification for ch, as a bit mask. 
*/ @@ -6710,6 +6711,10 @@ get_lbp (unsigned int ch) if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB)) attr |= (int64_t) 1 << LBP_JT; + /* regional indicator */ + if (ch >= 0x1F1E6 && ch <= 0x1F1FF) + attr |= (int64_t) 1 << LBP_RI; + /* complex context (South East Asian) */ if (((unicode_attributes[ch].category[0] == 'C' && unicode_attributes[ch].category[1] == 'f') @@ -6862,7 +6867,7 @@ get_lbp (unsigned int ch) || ch == 0x2064 /* INVISIBLE PLUS */ /* Extra characters for compatibility with Unicode LineBreak.txt. */ || ch == 0x110BD /* KAITHI NUMBER SIGN */) - if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))) + if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))) { 
/* ambiguous (alphabetic) ? */ if ((unicode_width[ch] != NULL @@ -6987,6 +6992,7 @@ debug_output_lbp (FILE *stream) PRINT_BIT(attr,LBP_JL); PRINT_BIT(attr,LBP_JV); PRINT_BIT(attr,LBP_JT); + PRINT_BIT(attr,LBP_RI); PRINT_BIT(attr,LBP_SA); PRINT_BIT(attr,LBP_XX); #undef PRINT_BIT @@ -7102,6 +7108,7 @@ fill_org_lbp (const char *linebreak_filename) TRY(LBP_JL) TRY(LBP_JV) TRY(LBP_JT) + TRY(LBP_RI) TRY(LBP_SA) TRY(LBP_XX) #undef TRY @@ -7184,6 +7191,7 @@ debug_output_org_lbp (FILE *stream) PRINT_BIT(attr,LBP_JL); PRINT_BIT(attr,LBP_JV); PRINT_BIT(attr,LBP_JT); + PRINT_BIT(attr,LBP_RI); PRINT_BIT(attr,LBP_SA); PRINT_BIT(attr,LBP_XX); #undef PRINT_BIT @@ -7358,6 +7366,7 @@ output_lbp (FILE *stream1, FILE *stream2) CASE(LBP_JL); CASE(LBP_JV); CASE(LBP_JT); + CASE(LBP_RI); CASE(LBP_SA); CASE(LBP_XX); #undef CASE @@ -7457,7 +7466,8 @@ enum WBP_MIDLETTER = 4, WBP_MIDNUM = 5, WBP_NUMERIC = 6, - WBP_EXTENDNUMLET = 7 + WBP_EXTENDNUMLET = 7, + WBP_RI = 13 }; /* Returns the word breaking property for ch, as a bit mask. 
*/ @@ -7525,6 +7535,9 @@ get_wbp (unsigned int ch) if (unicode_attributes[ch].category != NULL && strcmp (unicode_attributes[ch].category, "Pc") == 0) attr |= 1 << WBP_EXTENDNUMLET; + + if (((get_lbp (ch) >> LBP_RI) & 1) != 0) + attr |= 1 << WBP_RI; } if (attr == 0) @@ -7570,7 +7583,9 @@ debug_output_wbp (FILE *stream) fprintf (stream, " Numeric"); if (attr & (1 << WBP_EXTENDNUMLET)) fprintf (stream, " ExtendNumLet"); - fprintf (stream, "\n"); + if (attr & (1 << WBP_RI)) + fprintf (stream, " Regional_Indicator"); + fprintf (stream, "\n"); } } } @@ -7655,6 +7670,7 @@ fill_org_wbp (const char *wordbreakproperty_filename) PROP ("MidNum", WBP_MIDNUM) PROP ("Numeric", WBP_NUMERIC) PROP ("ExtendNumLet", WBP_EXTENDNUMLET) + PROP ("Regional_Indicator", WBP_RI) #undef PROP { fprintf (stderr, "unknown property value '%s' in '%s'\n", propname, @@ -7701,6 +7717,7 @@ debug_output_org_wbp (FILE *stream) PROP ("MidNum", WBP_MIDNUM) PROP ("Numeric", WBP_NUMERIC) PROP ("ExtendNumLet", WBP_EXTENDNUMLET) + PROP ("Regional_Indicator", WBP_RI) #undef PROP fprintf (stream, " ??"); fprintf (stream, "\n"); @@ -7853,6 +7870,7 @@ output_wbp (FILE *stream) CASE(WBP_MIDNUM); CASE(WBP_NUMERIC); CASE(WBP_EXTENDNUMLET); + CASE(WBP_RI); #undef CASE default: abort (); @@ -7933,7 +7951,8 @@ enum GBP_V = 8, GBP_T = 9, GBP_LV = 10, - GBP_LVT = 11 + GBP_LVT = 11, + GBP_RI = 12 }; /* Construction of sparse 3-level tables. 
*/ @@ -8004,6 +8023,7 @@ output_gbp_test (const char *filename) CASE (GBP_T) CASE (GBP_LV) CASE (GBP_LVT) + CASE (GBP_RI) #undef CASE default: abort (); @@ -8201,6 +8221,7 @@ fill_org_gbp (const char *graphemebreakproperty_filename) PROP ("T", GBP_T) PROP ("LV", GBP_LV) PROP ("LVT", GBP_LVT) + PROP ("Regional_Indicator", GBP_RI) #undef PROP { fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname, diff --git a/lib/unigbrk.in.h b/lib/unigbrk.in.h index 8335e5a..a708a8c 100644 --- a/lib/unigbrk.in.h +++ b/lib/unigbrk.in.h @@ -51,7 +51,8 @@ enum GBP_V = 8, GBP_T = 9, GBP_LV = 10, - GBP_LVT = 11 + GBP_LVT = 11, + GBP_RI = 12 }; /* Return the Grapheme_Cluster_Break property of a Unicode character. */ diff --git a/lib/unigbrk/uc-is-grapheme-break.c b/lib/unigbrk/uc-is-grapheme-break.c index 0e61e79..7d1759c 100644 --- a/lib/unigbrk/uc-is-grapheme-break.c +++ b/lib/unigbrk/uc-is-grapheme-break.c @@ -47,6 +47,9 @@ /* GB8 */ \ ((A) == GBP_LVT || (A) == GBP_T) && (B) == GBP_T ? false : \ \ + /* GB8a */ \ + (A) == GBP_RI && (B) == GBP_RI ? false : \ + \ /* GB9 */ \ (B) == GBP_EXTEND ? 
false : \ \ @@ -71,9 +74,10 @@ | (UC_IS_GRAPHEME_BREAK(A, GBP_V) << GBP_V) \ | (UC_IS_GRAPHEME_BREAK(A, GBP_T) << GBP_T) \ | (UC_IS_GRAPHEME_BREAK(A, GBP_LV) << GBP_LV) \ - | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT) << GBP_LVT)) + | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT) << GBP_LVT) \ + | (UC_IS_GRAPHEME_BREAK(A, GBP_RI) << GBP_RI)) -static const unsigned short int gb_table[12] = +static const unsigned short int gb_table[13] = { UC_GRAPHEME_BREAKS_FOR(0), /* GBP_OTHER */ UC_GRAPHEME_BREAKS_FOR(1), /* GBP_CR */ @@ -87,6 +91,7 @@ static const unsigned short int gb_table[12] = UC_GRAPHEME_BREAKS_FOR(9), /* GBP_T */ UC_GRAPHEME_BREAKS_FOR(10), /* GBP_LV */ UC_GRAPHEME_BREAKS_FOR(11), /* GBP_LVT */ + UC_GRAPHEME_BREAKS_FOR(12), /* GBP_RI */ }; bool diff --git a/lib/unilbrk/lbrktables.c b/lib/unilbrk/lbrktables.c index d60321d..f4a55a3 100644 --- a/lib/unilbrk/lbrktables.c +++ b/lib/unilbrk/lbrktables.c @@ -23,36 +23,37 @@ /* Define unilbrkprop, table of line breaking properties. */ #include "unilbrk/lbrkprop2.h" -const unsigned char unilbrk_table[26][26] = +const unsigned char unilbrk_table[27][27] = { /* after */ - /* WJ GL B2 BA BB HY CL CP EX IN NS OP QU IS NU PO PR SY AL H2 H3 ID JL JV JT HL */ -/* WJ */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, }, -/* GL */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, }, -/* B2 */ { P, I, P, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, }, -/* BA */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, }, -/* BB */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, }, -/* HY */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, D, }, -/* CL */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, D, I, I, P, D, D, D, D, D, D, D, D, }, -/* CP */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, I, I, I, P, I, D, D, D, D, D, D, I, }, -/* EX */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, 
D, D, D, P, D, D, D, D, D, D, D, D, }, -/* IN */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, }, -/* NS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, }, -/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, }, -/* QU */ { P, I, I, I, I, I, P, P, P, I, I, P, I, P, I, I, I, P, I, I, I, I, I, I, I, I, }, -/* IS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, D, }, -/* NU */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, I, I, P, I, D, D, D, D, D, D, I, }, -/* PO */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, }, -/* PR */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, I, I, I, I, I, I, I, }, -/* SY */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, D, }, -/* AL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, }, -/* H2 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, I, I, D, }, -/* H3 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, I, D, }, -/* ID */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, D, D, }, -/* JL */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, I, I, D, I, I, D, D, }, -/* JV */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, I, I, D, }, -/* JT */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, I, D, }, -/* HL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, }, + /* WJ GL B2 BA BB HY CL CP EX IN NS OP QU IS NU PO PR SY AL H2 H3 ID JL JV JT HL RI */ +/* WJ */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, }, +/* GL */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, }, +/* B2 */ { P, I, P, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, }, 
+/* BA */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, }, +/* BB */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, }, +/* HY */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, D, D, }, +/* CL */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, D, I, I, P, D, D, D, D, D, D, D, D, D, }, +/* CP */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, I, I, I, P, I, D, D, D, D, D, D, I, D, }, +/* EX */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, }, +/* IN */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, }, +/* NS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, }, +/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, }, +/* QU */ { P, I, I, I, I, I, P, P, P, I, I, P, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, }, +/* IS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, D, D, }, +/* NU */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, I, I, P, I, D, D, D, D, D, D, I, D, }, +/* PO */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, D, }, +/* PR */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, I, I, I, I, I, I, I, D, }, +/* SY */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, D, D, }, +/* AL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, D, }, +/* H2 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, I, I, D, D, }, +/* H3 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, I, D, D, }, +/* ID */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, D, D, D, }, +/* JL */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, I, I, D, I, I, D, D, D, }, +/* JV */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, I, I, D, D, }, +/* JT */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, I, D, D, }, +/* HL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, D, }, +/* RI */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, I, D, I, }, /* "" */ /* before */ }; diff --git a/lib/unilbrk/lbrktables.h b/lib/unilbrk/lbrktables.h index 95bb502..9c76ad7 100644 --- a/lib/unilbrk/lbrktables.h +++ b/lib/unilbrk/lbrktables.h @@ -21,22 +21,22 @@ enum { - /* Values >= 26 are resolved at run time. */ - LBP_BK = 26, /* mandatory break */ + /* Values >= 27 are resolved at run time. */ + LBP_BK = 27, /* mandatory break */ /*LBP_CR, carriage return - not used here because it's a DOSism */ /*LBP_LF, line feed - not used here because it's a DOSism */ - LBP_CM = 27, /* attached characters and combining marks */ + LBP_CM = 28, /* attached characters and combining marks */ /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */ /*LBP_SG, surrogates - not used here because they are not characters */ LBP_WJ = 0, /* word joiner */ - LBP_ZW = 28, /* zero width space */ + LBP_ZW = 29, /* zero width space */ LBP_GL = 1, /* non-breaking (glue) */ - LBP_SP = 29, /* space */ + LBP_SP = 30, /* space */ LBP_B2 = 2, /* break opportunity before and after */ LBP_BA = 3, /* break opportunity after */ LBP_BB = 4, /* break opportunity before */ LBP_HY = 5, /* hyphen */ - LBP_CB = 30, /* contingent break opportunity */ + LBP_CB = 31, /* contingent break opportunity */ LBP_CL = 6, /* closing punctuation */ LBP_CP = 7, /* closing parenthesis */ LBP_EX = 8, /* exclamation/interrogation */ @@ -49,7 +49,7 @@ enum LBP_PO = 15, /* postfix (numeric) */ LBP_PR = 16, /* prefix (numeric) */ LBP_SY = 17, /* symbols allowing breaks */ - LBP_AI = 31, /* ambiguous (alphabetic or ideograph) */ + LBP_AI = 32, /* ambiguous (alphabetic or ideograph) */ LBP_AL = 18, /* ordinary alphabetic and symbol characters */ /*LBP_CJ, conditional 
Japanese starters, resolved to NS */ LBP_H2 = 19, /* Hangul LV syllable */ @@ -59,8 +59,9 @@ enum LBP_JL = 22, /* Hangul L Jamo */ LBP_JV = 23, /* Hangul V Jamo */ LBP_JT = 24, /* Hangul T Jamo */ - LBP_SA = 32, /* complex context (South East Asian) */ - LBP_XX = 33 /* unknown */ + LBP_RI = 26, /* regional indicator */ + LBP_SA = 33, /* complex context (South East Asian) */ + LBP_XX = 34 /* unknown */ }; #include "lbrkprop1.h" @@ -91,7 +92,7 @@ unilbrkprop_lookup (ucs4_t uc) #define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */ #define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */ -extern const unsigned char unilbrk_table[26][26]; +extern const unsigned char unilbrk_table[27][27]; /* We don't support line breaking of complex-context dependent characters (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */ diff --git a/lib/uniwbrk.in.h b/lib/uniwbrk.in.h index ab4b532..c272d48 100644 --- a/lib/uniwbrk.in.h +++ b/lib/uniwbrk.in.h @@ -49,7 +49,8 @@ enum WBP_MIDLETTER = 4, WBP_MIDNUM = 5, WBP_NUMERIC = 6, - WBP_EXTENDNUMLET = 7 + WBP_EXTENDNUMLET = 7, + WBP_RI = 13 }; /* Return the Word_Break property of a Unicode character. */ diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h index 33ca7eb..04d2738 100644 --- a/lib/uniwbrk/u-wordbreaks.h +++ b/lib/uniwbrk/u-wordbreaks.h @@ -55,16 +55,12 @@ FUNC (const UNIT *s, size_t n, char *p) if (last_char_prop == WBP_CR && prop == WBP_LF) /* *p = 0 */; /* Break before and after newlines. */ - else if (last_char_prop >= WBP_NEWLINE - /* same as: - last_char_prop == WBP_CR - || last_char_prop == WBP_LF - || last_char_prop == WBP_NEWLINE */ - || prop >= WBP_NEWLINE - /* same as: - prop == WBP_CR - || prop == WBP_LF - || prop == WBP_NEWLINE */) + else if ((last_char_prop == WBP_CR + || last_char_prop == WBP_LF + || last_char_prop == WBP_NEWLINE) + || (prop == WBP_CR + || prop == WBP_LF + || prop == WBP_NEWLINE)) *p = 1; /* Ignore Format and Extend characters. 
*/ else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) @@ -85,6 +81,7 @@ FUNC (const UNIT *s, size_t n, char *p) (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) ExtendNumLet × ExtendNumLet (WB13a) ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) + Regional_Indicator × Regional_Indicator (WB13c) */ /* No break across certain punctuation. Also, disable word breaks that were recognized earlier (due to lookahead of @@ -101,10 +98,27 @@ FUNC (const UNIT *s, size_t n, char *p) *last_compchar_ptr = 0; /* *p = 0; */ } + /* Break after Format and Extend characters. */ + else if (last_compchar_prop == WBP_EXTEND + || last_compchar_prop == WBP_FORMAT) + *p = 1; else { + /* Normalize property value to table index, + skipping 5 properties: WBP_EXTEND, + WBP_FORMAT, WBP_NEWLINE, WBP_CR, and + WBP_LF. */ + int last_compchar_prop_index = last_compchar_prop; + int prop_index = prop; + + if (last_compchar_prop_index >= WBP_EXTEND) + last_compchar_prop_index -= 5; + + if (prop_index >= WBP_EXTEND) + prop_index -= 5; + /* Perform a single table lookup. 
*/ - if (uniwbrk_table[last_compchar_prop][prop]) + if (uniwbrk_table[last_compchar_prop_index][prop_index]) *p = 1; /* else *p = 0; */ } diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c index 7cbe4d6..04bd0e5 100644 --- a/lib/uniwbrk/wbrktable.c +++ b/lib/uniwbrk/wbrktable.c @@ -32,21 +32,21 @@ (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) ExtendNumLet × ExtendNumLet (WB13a) ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) + Regional_Indicator × Regional_Indicator (WB13c) */ -const unsigned char uniwbrk_table[10][8] = +const unsigned char uniwbrk_table[9][9] = { /* current: OTHER MIDNUMLET NUMERIC */ /* KATAKANA MIDLETTER EXTENDNUMLET */ - /* ALETTER MIDNUM */ + /* ALETTER MIDNUM RI */ /* last */ - /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0 }, - /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0 }, - /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0 }, - /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0 }, - /* WBP_EXTEND */ { 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_FORMAT */ { 1, 1, 1, 1, 1, 1, 1, 1 } + /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0, 1 }, + /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0, 1 }, + /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0, 1 }, + /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0, 1 }, + /* WBP_RI */ { 1, 1, 1, 1, 1, 1, 1, 1, 0 } }; diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h index 1b48adf..50b7823 100644 --- a/lib/uniwbrk/wbrktable.h +++ b/lib/uniwbrk/wbrktable.h @@ -15,4 +15,4 @@ You should have received a copy of the GNU Lesser General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -extern const unsigned char uniwbrk_table[10][8]; +extern const unsigned char uniwbrk_table[9][9]; diff --git a/tests/unigbrk/test-uc-gbrk-prop.c b/tests/unigbrk/test-uc-gbrk-prop.c index 1c71280..4bfbdba 100644 --- a/tests/unigbrk/test-uc-gbrk-prop.c +++ b/tests/unigbrk/test-uc-gbrk-prop.c @@ -50,6 +50,7 @@ graphemebreakproperty_to_string (int gbp) CASE(T) CASE(LV) CASE(LVT) + CASE(RI) } abort (); } diff --git a/tests/unigbrk/test-uc-is-grapheme-break.c b/tests/unigbrk/test-uc-is-grapheme-break.c index a93f6f2..dbaf3dc 100644 --- a/tests/unigbrk/test-uc-is-grapheme-break.c +++ b/tests/unigbrk/test-uc-is-grapheme-break.c @@ -44,6 +44,7 @@ graphemebreakproperty_to_string (int gbp) CASE(T) CASE(LV) CASE(LVT) + CASE(RI) } abort (); } diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c index 736cdba..41585f7 100644 --- a/tests/uniwbrk/test-uc-wordbreaks.c +++ b/tests/uniwbrk/test-uc-wordbreaks.c @@ -47,6 +47,7 @@ wordbreakproperty_to_string (int wbp) CASE(MIDNUM) CASE(NUMERIC) CASE(EXTENDNUMLET) + CASE(RI) } abort (); } -- 1.9.3