[PATCH v2 09/10] Update to Unicode 6.3.0

Daiki Ueno Thu, 23 Oct 2014 01:02:40 -0700

* tests/uniwbrk/test-uc-wordbreaks.c
(wordbreakproperty_to_string): Support WBP_DQ, WBP_SQ, and WBP_HL.


* lib/uniwbrk.in.h (WBP_DQ, WBP_SQ, WBP_HL): New enumeration values.
* lib/uniwbrk/u-wordbreaks.h (FUNC): Support WB7a, WB7b, and WB7c.
Update WB5, WB6, WB7, WB9, WB11, WB12, WB13a, and WB13b.
* lib/uniwbrk/wbrktable.h (uniwbrk_table): Adjust table size.
* lib/uniwbrk/wbrktable.c (uniwbrk_table): Support rule WB7a.
Update WB5, WB9, WB10, WB13a, and WB13b.

* lib/gen-uni-tables.c (UC_BIDI_LRI, UC_BIDI_RLI, UC_BIDI_FSI)
(UC_BIDI_PDI): New enumeration values.
(bidi_category_byname): Support those enum values.
(is_WBP_MIDNUMLET): Exclude 0x0027 (SINGLE QUOTE), which now has a
dedicated property (WBP_SQ) assigned.
(is_property_case_ignorable): Check 0x0027 explicitly, since it is no
longer matched by is_WBP_MIDNUMLET.
(WBP_DQ, WBP_SQ, WBP_HL): New enumeration values.
(get_wbp, debug_output_wbp, fill_org_wbp, debug_output_org_wbp)
(output_wbp): Support those enum values.

* lib/unictype.in.h (UC_BIDI_LRI, UC_BIDI_RLI, UC_BIDI_FSI)
(UC_BIDI_PDI): New enumeration values.
* lib/unictype/bidi_byname.gperf: Add those property names.
---
 lib/gen-uni-tables.c               | 76 ++++++++++++++++++++++++++++++++++----
 lib/unictype.in.h                  |  6 ++-
 lib/unictype/bidi_byname.gperf     | 12 ++++++
 lib/uniwbrk.in.h                   |  5 ++-
 lib/uniwbrk/u-wordbreaks.h         | 38 ++++++++++++-------
 lib/uniwbrk/wbrktable.c            | 52 ++++++++++++++------------
 lib/uniwbrk/wbrktable.h            |  2 +-
 tests/uniwbrk/test-uc-wordbreaks.c |  3 ++
 8 files changed, 145 insertions(+), 49 deletions(-)

diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index f833777..af541d1 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -32,7 +32,7 @@
                       /usr/local/share/Unidata/CompositionExclusions.txt \
                       /usr/local/share/Unidata/SpecialCasing.txt \
                       /usr/local/share/Unidata/CaseFolding.txt \
-                      6.2.0
+                      6.3.0
  */
 
 #include <stdbool.h>
@@ -1307,7 +1307,11 @@ enum
   UC_BIDI_B,   /* Paragraph Separator */
   UC_BIDI_S,   /* Segment Separator */
   UC_BIDI_WS,  /* Whitespace */
-  UC_BIDI_ON   /* Other Neutral */
+  UC_BIDI_ON,  /* Other Neutral */
+  UC_BIDI_LRI, /* Left-to-Right Isolate */
+  UC_BIDI_RLI, /* Right-to-Left Isolate */
+  UC_BIDI_FSI, /* First Strong Isolate */
+  UC_BIDI_PDI  /* Pop Directional Isolate */
 };
 
 static int
@@ -1365,7 +1369,20 @@ bidi_category_byname (const char *category_name)
           break;
         }
       break;
-    case 'L':
+    case 'F':
+      switch (category_name[1])
+        {
+        case 'S':
+          switch (category_name[2])
+            {
+            case 'I':
+              if (category_name[3] == '\0')
+                return UC_BIDI_FSI;
+              break;
+            }
+        }
+      break;
+   case 'L':
       switch (category_name[1])
         {
         case '\0':
@@ -1381,7 +1398,11 @@ bidi_category_byname (const char *category_name)
               if (category_name[3] == '\0')
                 return UC_BIDI_LRO;
               break;
-            }
+            case 'I':
+              if (category_name[3] == '\0')
+                return UC_BIDI_LRI;
+              break;
+           }
           break;
         }
       break;
@@ -1418,6 +1439,10 @@ bidi_category_byname (const char *category_name)
               if (category_name[3] == '\0')
                 return UC_BIDI_PDF;
               break;
+            case 'I':
+              if (category_name[3] == '\0')
+                return UC_BIDI_PDI;
+              break;
             }
           break;
         }
@@ -1438,7 +1463,11 @@ bidi_category_byname (const char *category_name)
               if (category_name[3] == '\0')
                 return UC_BIDI_RLO;
               break;
-            }
+            case 'I':
+              if (category_name[3] == '\0')
+                return UC_BIDI_RLI;
+              break;
+           }
           break;
         }
       break;
@@ -2518,7 +2547,7 @@ output_mirror (const char *filename, const char *version)
 static bool
 is_WBP_MIDNUMLET (unsigned int ch)
 {
-  return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
+  return (ch == 0x002E || ch == 0x2018 || ch == 0x2019
           || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E);
 }
 
@@ -2999,6 +3028,7 @@ static bool
 is_property_case_ignorable (unsigned int ch)
 {
   bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
+                  || ch == 0x0027
                   || is_category_Mn (ch)
                   || is_category_Me (ch)
                   || is_category_Cf (ch)
@@ -7467,7 +7497,10 @@ enum
   WBP_MIDNUM       = 5,
   WBP_NUMERIC      = 6,
   WBP_EXTENDNUMLET = 7,
-  WBP_RI           = 13
+  WBP_RI           = 13,
+  WBP_DQ           = 14,
+  WBP_SQ           = 15,
+  WBP_HL           = 16
 };
 
 /* Returns the word breaking property for ch, as a bit mask.  */
@@ -7506,6 +7539,11 @@ get_wbp (unsigned int ch)
           || ch == 0xFF70)
         attr |= 1 << WBP_KATAKANA;
 
+      if ((unicode_scripts[ch] < numscripts
+           && strcmp (scripts[unicode_scripts[ch]], "Hebrew") == 0)
+          && strcmp (unicode_attributes[ch].category, "Lo") == 0)
+        attr |= 1 << WBP_HL;
+
       if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
            || ch == 0x05F3)
           && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
@@ -7513,7 +7551,8 @@ get_wbp (unsigned int ch)
           && ((get_lbp (ch) >> LBP_SA) & 1) == 0
           && !(unicode_scripts[ch] < numscripts
                && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
-          && (attr & (1 << WBP_EXTEND)) == 0)
+          && (attr & (1 << WBP_EXTEND)) == 0
+          && (attr & (1 << WBP_HL)) == 0)
         attr |= 1 << WBP_ALETTER;
 
       if (is_WBP_MIDNUMLET (ch))
@@ -7538,6 +7577,12 @@ get_wbp (unsigned int ch)
 
       if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
         attr |= 1 << WBP_RI;
+
+      if (ch == 0x0022)
+        attr |= 1 << WBP_DQ;
+
+      if (ch == 0x0027)
+        attr |= 1 << WBP_SQ;
     }
 
   if (attr == 0)
@@ -7585,6 +7630,12 @@ debug_output_wbp (FILE *stream)
             fprintf (stream, " ExtendNumLet");
           if (attr & (1 << WBP_RI))
             fprintf (stream, " Regional_Indicator");
+          if (attr & (1 << WBP_DQ))
+            fprintf (stream, " Double_Quote");
+          if (attr & (1 << WBP_SQ))
+            fprintf (stream, " Single_Quote");
+          if (attr & (1 << WBP_HL))
+            fprintf (stream, " Hebrew_Letter");
          fprintf (stream, "\n");
         }
     }
@@ -7671,6 +7722,9 @@ fill_org_wbp (const char *wordbreakproperty_filename)
       PROP ("Numeric", WBP_NUMERIC)
       PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
       PROP ("Regional_Indicator", WBP_RI)
+      PROP ("Double_Quote", WBP_DQ)
+      PROP ("Single_Quote", WBP_SQ)
+      PROP ("Hebrew_Letter", WBP_HL)
 #undef PROP
         {
           fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
@@ -7718,6 +7772,9 @@ debug_output_org_wbp (FILE *stream)
           PROP ("Numeric", WBP_NUMERIC)
           PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
           PROP ("Regional_Indicator", WBP_RI)
+          PROP ("Double_Quote", WBP_DQ)
+          PROP ("Single_Quote", WBP_SQ)
+          PROP ("Hebrew_Letter", WBP_HL)
 #undef PROP
           fprintf (stream, " ??");
           fprintf (stream, "\n");
@@ -7871,6 +7928,9 @@ output_wbp (FILE *stream)
           CASE(WBP_NUMERIC);
           CASE(WBP_EXTENDNUMLET);
           CASE(WBP_RI);
+          CASE(WBP_DQ);
+          CASE(WBP_SQ);
+          CASE(WBP_HL);
 #undef CASE
           default:
             abort ();
diff --git a/lib/unictype.in.h b/lib/unictype.in.h
index 30c71aa..c31d9e5 100644
--- a/lib/unictype.in.h
+++ b/lib/unictype.in.h
@@ -312,7 +312,11 @@ enum
   UC_BIDI_B,   /* Paragraph Separator */
   UC_BIDI_S,   /* Segment Separator */
   UC_BIDI_WS,  /* Whitespace */
-  UC_BIDI_ON   /* Other Neutral */
+  UC_BIDI_ON,  /* Other Neutral */
+  UC_BIDI_LRI, /* Left-to-Right Isolate */
+  UC_BIDI_RLI, /* Right-to-Left Isolate */
+  UC_BIDI_FSI, /* First Strong Isolate */
+  UC_BIDI_PDI  /* Pop Directional Isolate */
 };
 
 /* Return the name of a bidi class.  */
diff --git a/lib/unictype/bidi_byname.gperf b/lib/unictype/bidi_byname.gperf
index 9cacacf..5bb0faa 100644
--- a/lib/unictype/bidi_byname.gperf
+++ b/lib/unictype/bidi_byname.gperf
@@ -19,14 +19,18 @@ CS, UC_BIDI_CS
 EN, UC_BIDI_EN
 ES, UC_BIDI_ES
 ET, UC_BIDI_ET
+FSI, UC_BIDI_FSI
 L, UC_BIDI_L
 LRE, UC_BIDI_LRE
+LRI, UC_BIDI_LRI
 LRO, UC_BIDI_LRO
 NSM, UC_BIDI_NSM
 ON, UC_BIDI_ON
 PDF, UC_BIDI_PDF
+PDI, UC_BIDI_PDI
 R, UC_BIDI_R
 RLE, UC_BIDI_RLE
+RLI, UC_BIDI_RLI
 RLO, UC_BIDI_RLO
 S, UC_BIDI_S
 WS, UC_BIDI_WS
@@ -46,10 +50,14 @@ European Separator, UC_BIDI_ES
 EuropeanSeparator, UC_BIDI_ES
 European Terminator, UC_BIDI_ET
 EuropeanTerminator, UC_BIDI_ET
+First Strong Isolate, UC_BIDI_FSI
+FirstStrongIsolate, UC_BIDI_FSI
 Left To Right, UC_BIDI_L
 LeftToRight, UC_BIDI_L
 Left To Right Embedding, UC_BIDI_LRE
 LeftToRightEmbedding, UC_BIDI_LRE
+Left To Right Isolate, UC_BIDI_LRI
+LeftToRightIsolate, UC_BIDI_LRI
 Left To Right Override, UC_BIDI_LRO
 LeftToRightOverride, UC_BIDI_LRO
 Nonspacing Mark, UC_BIDI_NSM
@@ -58,10 +66,14 @@ Other Neutral, UC_BIDI_ON
 OtherNeutral, UC_BIDI_ON
 Pop Directional Format, UC_BIDI_PDF
 PopDirectionalFormat, UC_BIDI_PDF
+Pop Directional Isolate, UC_BIDI_PDI
+PopDirectionalIsolate, UC_BIDI_PDI
 Right To Left, UC_BIDI_R
 RightToLeft, UC_BIDI_R
 Right To Left Embedding, UC_BIDI_RLE
 RightToLeftEmbedding, UC_BIDI_RLE
+Right To Left Isolate, UC_BIDI_RLI
+RightToLeftIsolate, UC_BIDI_RLI
 Right To Left Override, UC_BIDI_RLO
 RightToLeftOverride, UC_BIDI_RLO
 Segment Separator, UC_BIDI_S
diff --git a/lib/uniwbrk.in.h b/lib/uniwbrk.in.h
index c272d48..9abea42 100644
--- a/lib/uniwbrk.in.h
+++ b/lib/uniwbrk.in.h
@@ -50,7 +50,10 @@ enum
   WBP_MIDNUM       = 5,
   WBP_NUMERIC      = 6,
   WBP_EXTENDNUMLET = 7,
-  WBP_RI           = 13
+  WBP_RI           = 13,
+  WBP_DQ           = 14,
+  WBP_SQ           = 15,
+  WBP_HL           = 16
 };
 
 /* Return the Word_Break property of a Unicode character.  */
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 04d2738..b043d68 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -69,31 +69,41 @@ FUNC (const UNIT *s, size_t n, char *p)
 
                       secondlast          last             current
 
-                       ALetter   (MidLetter | MidNumLet) × ALetter      (WB7)
-                       ALetter × (MidLetter | MidNumLet)   ALetter      (WB6)
-                       Numeric   (MidNum | MidNumLet)    × Numeric      (WB11)
-                       Numeric × (MidNum | MidNumLet)      Numeric      (WB12)
-                                                 ALetter × ALetter      (WB5)
-                                                 ALetter × Numeric      (WB9)
-                                                 Numeric × ALetter      (WB10)
+    (ALetter | HL)   (MidLetter | MidNumLet | SQ) × (ALetter | HL)      (WB7)
+    (ALetter | HL) × (MidLetter | MidNumLet | SQ)   (ALetter | HL)      (WB6)
+                  Numeric   (MidNum | MidNumLet | SQ)    × Numeric      (WB11)
+                  Numeric × (MidNum | MidNumLet | SQ)      Numeric      (WB12)
+                                                        HL × DQ HL      (WB7b)
+                                                        HL DQ × HL      (WB7c)
+                                   (ALetter | HL) × (ALetter | HL)      (WB5)
+                                          (ALetter | HL) × Numeric      (WB9)
+                                          Numeric × (ALetter | HL)      (WB10)
                                                  Numeric × Numeric      (WB8)
+                                                      HL × SQ           (WB7a)
                                                 Katakana × Katakana     (WB13)
-                          (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
+                     (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
                                             ExtendNumLet × ExtendNumLet (WB13a)
-                         ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b)
+                    ExtendNumLet × (ALetter | HL | Numeric | Katakana)  (WB13b)
                                Regional_Indicator × Regional_Indicator  (WB13c)
                    */
                   /* No break across certain punctuation.  Also, disable word
                      breaks that were recognized earlier (due to lookahead of
                      only one complex character).  */
-                  if ((prop == WBP_ALETTER
+                  if (((prop == WBP_ALETTER
+                        || prop == WBP_HL)
                        && (last_compchar_prop == WBP_MIDLETTER
-                           || last_compchar_prop == WBP_MIDNUMLET)
-                       && secondlast_compchar_prop == WBP_ALETTER)
+                           || last_compchar_prop == WBP_MIDNUMLET
+                           || last_compchar_prop == WBP_SQ)
+                       && (secondlast_compchar_prop == WBP_ALETTER
+                           || secondlast_compchar_prop == WBP_HL))
                       || (prop == WBP_NUMERIC
                           && (last_compchar_prop == WBP_MIDNUM
-                              || last_compchar_prop == WBP_MIDNUMLET)
-                          && secondlast_compchar_prop == WBP_NUMERIC))
+                              || last_compchar_prop == WBP_MIDNUMLET
+                              || last_compchar_prop == WBP_SQ)
+                          && secondlast_compchar_prop == WBP_NUMERIC)
+                      || (prop == WBP_HL
+                          && last_compchar_prop == WBP_DQ
+                          && secondlast_compchar_prop == WBP_HL))
                     {
                       *last_compchar_ptr = 0;
                       /* *p = 0; */
diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c
index 04bd0e5..baeed58 100644
--- a/lib/uniwbrk/wbrktable.c
+++ b/lib/uniwbrk/wbrktable.c
@@ -22,31 +22,35 @@
 
 /* This table contains the following rules (see UAX #29):
 
-                       last         current
-
-                         ALetter × ALetter                         (WB5)
-                         ALetter × Numeric                         (WB9)
-                         Numeric × ALetter                         (WB10)
-                         Numeric × Numeric                         (WB8)
-                        Katakana × Katakana                        (WB13)
-  (ALetter | Numeric | Katakana) × ExtendNumLet                    (WB13a)
-                    ExtendNumLet × ExtendNumLet                    (WB13a)
-                    ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b)
-              Regional_Indicator × Regional_Indicator              (WB13c)
+                           last         current
+
+                     (ALetter | HL) × (ALetter | HL)                  (WB5)
+                     (ALetter | HL) × Numeric                         (WB9)
+                                 HL × SQ                              (WB7a)
+                            Numeric × (ALetter | HL)                  (WB10)
+                            Numeric × Numeric                         (WB8)
+                           Katakana × Katakana                        (WB13)
+(ALetter | HL | Numeric | Katakana) × ExtendNumLet                    (WB13a)
+                       ExtendNumLet × ExtendNumLet                    (WB13a)
+                   ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b)
+                 Regional_Indicator × Regional_Indicator              (WB13c)
  */
 
-const unsigned char uniwbrk_table[9][9] =
-{        /* current:      OTHER            MIDNUMLET         NUMERIC         */
-         /*                   KATAKANA           MIDLETTER      EXTENDNUMLET */
-         /*                          ALETTER            MIDNUM           RI  */
+const unsigned char uniwbrk_table[12][12] =
+{        /* current:      OTHER        MIDNUMLET    NUMERIC     DQ         */
+         /*                 KATAKANA     MIDLETTER    EXTENDNUMLET  SQ     */
+         /*                   ALETTER      MIDNUM           RI          HL */
   /* last */
-  /* WBP_OTHER */        {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_KATAKANA */     {  1,    0,    1,    1,    1,    1,    1,    0,    1 },
-  /* WBP_ALETTER */      {  1,    1,    0,    1,    1,    1,    0,    0,    1 },
-  /* WBP_MIDNUMLET */    {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_MIDLETTER */    {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_MIDNUM */       {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_NUMERIC */      {  1,    1,    0,    1,    1,    1,    0,    0,    1 },
-  /* WBP_EXTENDNUMLET */ {  1,    0,    0,    1,    1,    1,    0,    0,    1 },
-  /* WBP_RI */           {  1,    1,    1,    1,    1,    1,    1,    1,    0 }
+  /* WBP_OTHER */        {  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1 },
+  /* WBP_KATAKANA */     {  1,  0,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1 },
+  /* WBP_ALETTER */      {  1,  1,  0,  1,  1,  1,  0,  0,  1,  1,  1,  0 },
+  /* WBP_MIDNUMLET */    {  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1 },
+  /* WBP_MIDLETTER */    {  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1 },
+  /* WBP_MIDNUM */       {  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1 },
+  /* WBP_NUMERIC */      {  1,  1,  0,  1,  1,  1,  0,  0,  1,  1,  1,  0 },
+  /* WBP_EXTENDNUMLET */ {  1,  0,  0,  1,  1,  1,  0,  0,  1,  1,  1,  0 },
+  /* WBP_RI */           {  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1 },
+  /* WBP_DQ */           {  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1 },
+  /* WBP_SQ */           {  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1 },
+  /* WBP_HL */           {  1,  1,  0,  1,  1,  1,  0,  0,  1,  1,  0,  0 }
 };
diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h
index 50b7823..567a031 100644
--- a/lib/uniwbrk/wbrktable.h
+++ b/lib/uniwbrk/wbrktable.h
@@ -15,4 +15,4 @@
    You should have received a copy of the GNU Lesser General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
-extern const unsigned char uniwbrk_table[9][9];
+extern const unsigned char uniwbrk_table[12][12];
diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c
index 41585f7..fc9dd59 100644
--- a/tests/uniwbrk/test-uc-wordbreaks.c
+++ b/tests/uniwbrk/test-uc-wordbreaks.c
@@ -48,6 +48,9 @@ wordbreakproperty_to_string (int wbp)
       CASE(NUMERIC)
       CASE(EXTENDNUMLET)
       CASE(RI)
+      CASE(DQ)
+      CASE(SQ)
+      CASE(HL)
     }
   abort ();
 }
-- 
1.9.3

[PATCH v2 09/10] Update to Unicode 6.3.0

Reply via email to