[PATCH v2 06/10] uniname: Make codepoint transformation more flexible

Daiki Ueno Thu, 23 Oct 2014 01:02:57 -0700

The uniname module tries to reduce memory usage by grouping Unicode
codepoints so that each codepoint fit into a 16-bit integer (4-bit tag
plus 12-bit index).  However, this transformation cannot represent all
the characters in Unicode 7.0, where > 16 groups are needed.


The attached patch tries to remove the limitation.  It basically
switches the hard-coded transformation rule into a binary search on a
table of contiguous codepoint ranges.

* lib/uniname/gen-uninames.lisp (unicode-char): Rename CODE member
to INDEX, as it no longer represents a code-point.
(range): New struct.
(main): Switch to intervals list from a bit-pattern based
classification.
---
 lib/uniname/gen-uninames.lisp |  86 +++++++++---------
 lib/uniname/uniname.c         | 203 +++++++++++++++++++++++++-----------------
 2 files changed, 164 insertions(+), 125 deletions(-)

diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp
index d08e93f..e7de0a1 100755
--- a/lib/uniname/gen-uninames.lisp
+++ b/lib/uniname/gen-uninames.lisp
@@ -6,12 +6,18 @@
 (defparameter add-comments nil)
 
 (defstruct unicode-char
-  (code nil :type integer)
+  (index nil :type integer)
   (name nil :type string)
   word-indices
   word-indices-index
 )
 
+(defstruct range
+  (index nil :type integer)
+  (start-code nil :type integer)
+  (end-code nil :type integer)
+)
+
 (defstruct word-list
   (hashed nil :type hash-table)
   (sorted nil :type list)
@@ -22,7 +28,10 @@
 (defun main (inputfile outputfile)
   (declare (type string inputfile outputfile))
   #+UNICODE (setq *default-file-encoding* charset:utf-8)
-  (let ((all-chars '()))
+  (let ((all-chars '())
+        (all-ranges '())
+        (name-index 0)
+        range)
     ;; Read all characters and names from the input file.
     (with-open-file (istream inputfile :direction :input)
       (loop
@@ -41,40 +50,26 @@
                 ; specially as well.
                 (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
                             (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
-                  ; Transform the code so that it fits in 16 bits. In
-                  ; Unicode 5.1 the following ranges are used.
-                  ;   0x00000..0x04DFF  >>12=  0x00..0x04  -> 0x0..0x4
-                  ;   0x0A000..0x0AAFF  >>12=  0x0A        -> 0x5
-                  ;   0x0F900..0x0FFFF  >>12=  0x0F        -> 0x6
-                  ;   0x10000..0x10A58  >>12=  0x10        -> 0x7
-                  ;   0x12000..0x12473  >>12=  0x12        -> 0x8
-                  ;   0x1D000..0x1D7FF  >>12=  0x1D        -> 0x9
-                  ;   0x1F000..0x1F093  >>12=  0x1F        -> 0xA
-                  ;   0x2F800..0x2FAFF  >>12=  0x2F        -> 0xB
-                  ;   0xE0000..0xE00FF  >>12=  0xE0        -> 0xC
-                  (flet ((transform (x)
-                           (dpb
-                             (case (ash x -12)
-                               ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
-                               (#x0A 5)
-                               (#x0F 6)
-                               (#x10 7)
-                               (#x12 8)
-                               (#x1D 9)
-                               (#x1F #xA)
-                               (#x2F #xB)
-                               (#xE0 #xC)
-                               (t (error "Update the transform function for 
0x~5,'0X" x))
-                             )
-                             (byte 8 12)
-                             x
-                        )) )
-                    (push (make-unicode-char :code (transform code)
-                                             :name name-string)
-                          all-chars
-            ) ) ) ) )
+                  (push (make-unicode-char :index name-index
+                                           :name name-string)
+                        all-chars)
+                  ;; Update the contiguous range, or start a new range.
+                  (if (and range (= (1+ (range-end-code range)) code))
+                      (setf (range-end-code range) code)
+                    (progn
+                      (when range
+                        (push range all-ranges))
+                      (setq range (make-range :index name-index
+                                              :start-code code
+                                              :end-code code))))
+                  (incf name-index)
+                  (setq last-code code)
+                  ) ) )
     ) ) ) )
     (setq all-chars (nreverse all-chars))
+    (if range
+        (push range all-ranges))
+    (setq all-ranges (nreverse all-ranges))
     ;; Split into words.
     (let ((words-by-length (make-array 0 :adjustable t)))
       (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar 
#'unicode-char-name all-chars)))
@@ -257,14 +252,14 @@
             (incf i (length (unicode-char-word-indices uc)))
         ) )
         (format ostream "};~%")
-        (format ostream "static const struct { uint16_t code; uint32_t 
name:24; }~%")
+        (format ostream "static const struct { uint16_t index; uint32_t 
name:24; }~%")
         (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ 
>= 7)~%__attribute__((__packed__))~%#endif~%")
-        (format ostream "unicode_name_to_code[~D] = {~%"
+        (format ostream "unicode_name_to_index[~D] = {~%"
                         (length all-chars)
         )
         (dolist (uc all-chars)
           (format ostream "  { 0x~4,'0X, ~D },"
-                          (unicode-char-code uc)
+                          (unicode-char-index uc)
                           (unicode-char-word-indices-index uc)
           )
           (when add-comments
@@ -273,14 +268,14 @@
           (format ostream "~%")
         )
         (format ostream "};~%")
-        (format ostream "static const struct { uint16_t code; uint32_t 
name:24; }~%")
+        (format ostream "static const struct { uint16_t index; uint32_t 
name:24; }~%")
         (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ 
>= 7)~%__attribute__((__packed__))~%#endif~%")
-        (format ostream "unicode_code_to_name[~D] = {~%"
+        (format ostream "unicode_index_to_name[~D] = {~%"
                         (length all-chars)
         )
-        (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
+        (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index))
           (format ostream "  { 0x~4,'0X, ~D },"
-                          (unicode-char-code uc)
+                          (unicode-char-index uc)
                           (unicode-char-word-indices-index uc)
           )
           (when add-comments
@@ -295,6 +290,15 @@
         (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
                         (reduce #'max (mapcar (lambda (uc) (length 
(unicode-char-word-indices uc))) all-chars))
         )
+        (format ostream "static const struct { uint16_t index; uint32_t gap; 
uint16_t length; } unicode_ranges[~D] = {~%"
+                        (length all-ranges))
+        (dolist (range all-ranges)
+          (format ostream "  { ~D, ~D, ~D },~%"
+                  (range-index range)
+                  (- (range-start-code range) (range-index range))
+                  (1+ (- (range-end-code range) (range-start-code range))))
+        )
+        (format ostream "};~%")
       )
 ) ) )
 
diff --git a/lib/uniname/uniname.c b/lib/uniname/uniname.c
index 8f4b32d..276a1cb 100644
--- a/lib/uniname/uniname.c
+++ b/lib/uniname/uniname.c
@@ -45,10 +45,11 @@
   #define UNICODE_CHARNAME_WORD_CJK 417
   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
   static const uint16_t unicode_names[68940] = ...;
-  static const struct { uint16_t code; uint32_t name:24; } 
unicode_name_to_code[16626] = ...;
-  static const struct { uint16_t code; uint32_t name:24; } 
unicode_code_to_name[16626] = ...;
+  static const struct { uint16_t index; uint32_t name:24; } 
unicode_name_to_index[16626] = ...;
+  static const struct { uint16_t index; uint32_t name:24; } 
unicode_index_to_name[16626] = ...;
   #define UNICODE_CHARNAME_MAX_LENGTH 83
   #define UNICODE_CHARNAME_MAX_WORDS 13
+  static const struct { uint32_t index; uint32_t gap; uint16_t length; } 
unicode_ranges[401] = ...;
 */
 
 /* Returns the word with a given index.  */
@@ -127,6 +128,82 @@ unicode_name_word_lookup (const char *word, unsigned int 
length)
   return -1;
 }
 
+#define UNINAME_INVALID_INDEX UINT16_MAX
+
+/* Looks up the internal index of a Unicode character.  */
+static uint16_t
+unicode_code_to_index (ucs4_t c)
+{
+  /* Binary search in unicode_ranges.  */
+  unsigned int i1 = 0;
+  unsigned int i2 = SIZEOF (unicode_ranges);
+
+  for (;;)
+    {
+      unsigned int i = (i1 + i2) >> 1;
+      ucs4_t start_code =
+        unicode_ranges[i].index + unicode_ranges[i].gap;
+      ucs4_t end_code =
+        start_code + unicode_ranges[i].length - 1;
+
+      if (start_code <= c && c <= end_code)
+        return c - unicode_ranges[i].gap;
+
+      if (end_code < c)
+        {
+          if (i1 == i)
+            break;
+          /* Note here: i1 < i < i2.  */
+          i1 = i;
+        }
+      else if (c < start_code)
+        {
+          if (i2 == i)
+            break;
+          /* Note here: i1 <= i < i2.  */
+          i2 = i;
+        }
+    }
+  return UNINAME_INVALID_INDEX;
+}
+
+/* Looks up the codepoint of a Unicode character, from the given
+   internal index.  */
+static ucs4_t
+unicode_index_to_code (uint16_t index)
+{
+  /* Binary search in unicode_ranges.  */
+  unsigned int i1 = 0;
+  unsigned int i2 = SIZEOF (unicode_ranges);
+
+  for (;;)
+    {
+      unsigned int i = (i1 + i2) >> 1;
+      uint16_t start_index = unicode_ranges[i].index;
+      uint16_t end_index = start_index + unicode_ranges[i].length - 1;
+
+      if (start_index <= index && index <= end_index)
+        return index + unicode_ranges[i].gap;
+
+      if (end_index < index)
+        {
+          if (i1 == i)
+            break;
+          /* Note here: i1 < i < i2.  */
+          i1 = i;
+        }
+      else if (index < start_index)
+        {
+          if (i2 == i)
+            break;
+          /* Note here: i1 <= i < i2.  */
+          i2 = i;
+        }
+    }
+  return UNINAME_INVALID;
+}
+
+
 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
    sections 3.11 and 4.4.  */
 static const char jamo_initial_short_name[19][3] =
@@ -203,78 +280,47 @@ unicode_character_name (ucs4_t c, char *buf)
     }
   else
     {
-      const uint16_t *words;
+      uint16_t index = unicode_code_to_index (c);
+      const uint16_t *words = NULL;
 
-      /* Transform the code so that it fits in 16 bits.  */
-      switch (c >> 12)
+      if (index != UNINAME_INVALID_INDEX)
         {
-        case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
-          break;
-        case 0x0A:
-          c -= 0x05000;
-          break;
-        case 0x0F:
-          c -= 0x09000;
-          break;
-        case 0x10:
-          c -= 0x09000;
-          break;
-        case 0x12:
-          c -= 0x0A000;
-          break;
-        case 0x1D:
-          c -= 0x14000;
-          break;
-        case 0x1F:
-          c -= 0x15000;
-          break;
-        case 0x2F:
-          c -= 0x24000;
-          break;
-        case 0xE0:
-          c -= 0xD4000;
-          break;
-        default:
-          return NULL;
+          /* Binary search in unicode_code_to_name.  */
+          unsigned int i1 = 0;
+          unsigned int i2 = SIZEOF (unicode_index_to_name);
+          for (;;)
+            {
+              unsigned int i = (i1 + i2) >> 1;
+              if (unicode_index_to_name[i].index == index)
+                {
+                  words = &unicode_names[unicode_index_to_name[i].name];
+                  break;
+                }
+              else if (unicode_index_to_name[i].index < index)
+                {
+                  if (i1 == i)
+                    {
+                      words = NULL;
+                      break;
+                    }
+                  /* Note here: i1 < i < i2.  */
+                  i1 = i;
+                }
+              else if (unicode_index_to_name[i].index > index)
+                {
+                  if (i2 == i)
+                    {
+                      words = NULL;
+                      break;
+                    }
+                  /* Note here: i1 <= i < i2.  */
+                  i2 = i;
+                }
+            }
         }
-
-      {
-        /* Binary search in unicode_code_to_name.  */
-        unsigned int i1 = 0;
-        unsigned int i2 = SIZEOF (unicode_code_to_name);
-        for (;;)
-          {
-            unsigned int i = (i1 + i2) >> 1;
-            if (unicode_code_to_name[i].code == c)
-              {
-                words = &unicode_names[unicode_code_to_name[i].name];
-                break;
-              }
-            else if (unicode_code_to_name[i].code < c)
-              {
-                if (i1 == i)
-                  {
-                    words = NULL;
-                    break;
-                  }
-                /* Note here: i1 < i < i2.  */
-                i1 = i;
-              }
-            else if (unicode_code_to_name[i].code > c)
-              {
-                if (i2 == i)
-                  {
-                    words = NULL;
-                    break;
-                  }
-                /* Note here: i1 <= i < i2.  */
-                i2 = i;
-              }
-          }
-      }
       if (words != NULL)
         {
-          /* Found it in unicode_code_to_name. Now concatenate the words.  */
+          /* Found it in unicode_index_to_name. Now concatenate the words.  */
           /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
           char *ptr = buf;
           for (;;)
@@ -463,15 +509,15 @@ unicode_name_character (const char *name)
                 for (; --i >= 0; )
                   words[i] = 2 * words[i] + 1;
               }
-              /* Binary search in unicode_name_to_code.  */
+              /* Binary search in unicode_name_to_index.  */
               {
                 unsigned int i1 = 0;
-                unsigned int i2 = SIZEOF (unicode_name_to_code);
+                unsigned int i2 = SIZEOF (unicode_name_to_index);
                 for (;;)
                   {
                     unsigned int i = (i1 + i2) >> 1;
                     const uint16_t *w = words;
-                    const uint16_t *p = 
&unicode_names[unicode_name_to_code[i].name];
+                    const uint16_t *p = 
&unicode_names[unicode_name_to_index[i].name];
                     unsigned int n = words_length;
                     for (;;)
                       {
@@ -493,18 +539,7 @@ unicode_name_character (const char *name)
                           }
                         p++; w++; n--;
                         if (n == 0)
-                          {
-                            unsigned int c = unicode_name_to_code[i].code;
-
-                            /* Undo the transformation to 16-bit space.  */
-                            static const unsigned int offset[13] =
-                              {
-                                0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
-                                0x05000, 0x09000, 0x09000, 0x0A000, 0x14000,
-                                0x15000, 0x24000, 0xD4000
-                              };
-                            return c + offset[c >> 12];
-                          }
+                          return unicode_index_to_code 
(unicode_name_to_index[i].index);
                       }
                   }
               }
-- 
1.9.3

[PATCH v2 06/10] uniname: Make codepoint transformation more flexible

Reply via email to