The uniname module tries to reduce memory usage by grouping Unicode codepoints so that each codepoint fit into a 16-bit integer (4-bit tag plus 12-bit index). However, this transformation cannot represent all the characters in Unicode 7.0, where > 16 groups are needed.
The attached patch tries to remove the limitation. It basically switches the hard-coded transformation rule into a binary search on a table of contiguous codepoint ranges. * lib/uniname/gen-uninames.lisp (unicode-char): Rename CODE member to INDEX, as it no longer represents a code-point. (range): New struct. (main): Switch to intervals list from a bit-pattern based classification. --- lib/uniname/gen-uninames.lisp | 86 +++++++++--------- lib/uniname/uniname.c | 203 +++++++++++++++++++++++++----------------- 2 files changed, 164 insertions(+), 125 deletions(-) diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp index d08e93f..e7de0a1 100755 --- a/lib/uniname/gen-uninames.lisp +++ b/lib/uniname/gen-uninames.lisp @@ -6,12 +6,18 @@ (defparameter add-comments nil) (defstruct unicode-char - (code nil :type integer) + (index nil :type integer) (name nil :type string) word-indices word-indices-index ) +(defstruct range + (index nil :type integer) + (start-code nil :type integer) + (end-code nil :type integer) +) + (defstruct word-list (hashed nil :type hash-table) (sorted nil :type list) @@ -22,7 +28,10 @@ (defun main (inputfile outputfile) (declare (type string inputfile outputfile)) #+UNICODE (setq *default-file-encoding* charset:utf-8) - (let ((all-chars '())) + (let ((all-chars '()) + (all-ranges '()) + (name-index 0) + range) ;; Read all characters and names from the input file. (with-open-file (istream inputfile :direction :input) (loop @@ -41,40 +50,26 @@ ; specially as well. (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A) (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D)) - ; Transform the code so that it fits in 16 bits. In - ; Unicode 5.1 the following ranges are used. - ; 0x00000..0x04DFF >>12= 0x00..0x04 -> 0x0..0x4 - ; 0x0A000..0x0AAFF >>12= 0x0A -> 0x5 - ; 0x0F900..0x0FFFF >>12= 0x0F -> 0x6 - ; 0x10000..0x10A58 >>12= 0x10 -> 0x7 - ; 0x12000..0x12473 >>12= 0x12 -> 0x8 - ; 0x1D000..0x1D7FF >>12= 0x1D -> 0x9 - ; 0x1F000..0x1F093 >>12= 0x1F -> 0xA - ; 0x2F800..0x2FAFF >>12= 0x2F -> 0xB - ; 0xE0000..0xE00FF >>12= 0xE0 -> 0xC - (flet ((transform (x) - (dpb - (case (ash x -12) - ((#x00 #x01 #x02 #x03 #x04) (ash x -12)) - (#x0A 5) - (#x0F 6) - (#x10 7) - (#x12 8) - (#x1D 9) - (#x1F #xA) - (#x2F #xB) - (#xE0 #xC) - (t (error "Update the transform function for 0x~5,'0X" x)) - ) - (byte 8 12) - x - )) ) - (push (make-unicode-char :code (transform code) - :name name-string) - all-chars - ) ) ) ) ) + (push (make-unicode-char :index name-index + :name name-string) + all-chars) + ;; Update the contiguous range, or start a new range. + (if (and range (= (1+ (range-end-code range)) code)) + (setf (range-end-code range) code) + (progn + (when range + (push range all-ranges)) + (setq range (make-range :index name-index + :start-code code + :end-code code)))) + (incf name-index) + (setq last-code code) + ) ) ) ) ) ) ) (setq all-chars (nreverse all-chars)) + (if range + (push range all-ranges)) + (setq all-ranges (nreverse all-ranges)) ;; Split into words. (let ((words-by-length (make-array 0 :adjustable t))) (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars))) @@ -257,14 +252,14 @@ (incf i (length (unicode-char-word-indices uc))) ) ) (format ostream "};~%") - (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%") + (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%") (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%") - (format ostream "unicode_name_to_code[~D] = {~%" + (format ostream "unicode_name_to_index[~D] = {~%" (length all-chars) ) (dolist (uc all-chars) (format ostream " { 0x~4,'0X, ~D }," - (unicode-char-code uc) + (unicode-char-index uc) (unicode-char-word-indices-index uc) ) (when add-comments @@ -273,14 +268,14 @@ (format ostream "~%") ) (format ostream "};~%") - (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%") + (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%") (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%") - (format ostream "unicode_code_to_name[~D] = {~%" + (format ostream "unicode_index_to_name[~D] = {~%" (length all-chars) ) - (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code)) + (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index)) (format ostream " { 0x~4,'0X, ~D }," - (unicode-char-code uc) + (unicode-char-index uc) (unicode-char-word-indices-index uc) ) (when add-comments @@ -295,6 +290,15 @@ (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%" (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) ) + (format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%" + (length all-ranges)) + (dolist (range all-ranges) + (format ostream " { ~D, ~D, ~D },~%" + (range-index range) + (- (range-start-code range) (range-index range)) + (1+ (- (range-end-code range) (range-start-code range)))) + ) + (format ostream "};~%") ) ) ) ) diff --git a/lib/uniname/uniname.c b/lib/uniname/uniname.c index 8f4b32d..276a1cb 100644 --- a/lib/uniname/uniname.c +++ b/lib/uniname/uniname.c @@ -45,10 +45,11 @@ #define UNICODE_CHARNAME_WORD_CJK 417 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107 static const uint16_t unicode_names[68940] = ...; - static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...; - static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...; + static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...; + static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...; #define UNICODE_CHARNAME_MAX_LENGTH 83 #define UNICODE_CHARNAME_MAX_WORDS 13 + static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...; */ /* Returns the word with a given index. */ @@ -127,6 +128,82 @@ unicode_name_word_lookup (const char *word, unsigned int length) return -1; } +#define UNINAME_INVALID_INDEX UINT16_MAX + +/* Looks up the internal index of a Unicode character. */ +static uint16_t +unicode_code_to_index (ucs4_t c) +{ + /* Binary search in unicode_ranges. */ + unsigned int i1 = 0; + unsigned int i2 = SIZEOF (unicode_ranges); + + for (;;) + { + unsigned int i = (i1 + i2) >> 1; + ucs4_t start_code = + unicode_ranges[i].index + unicode_ranges[i].gap; + ucs4_t end_code = + start_code + unicode_ranges[i].length - 1; + + if (start_code <= c && c <= end_code) + return c - unicode_ranges[i].gap; + + if (end_code < c) + { + if (i1 == i) + break; + /* Note here: i1 < i < i2. */ + i1 = i; + } + else if (c < start_code) + { + if (i2 == i) + break; + /* Note here: i1 <= i < i2. */ + i2 = i; + } + } + return UNINAME_INVALID_INDEX; +} + +/* Looks up the codepoint of a Unicode character, from the given + internal index. */ +static ucs4_t +unicode_index_to_code (uint16_t index) +{ + /* Binary search in unicode_ranges. */ + unsigned int i1 = 0; + unsigned int i2 = SIZEOF (unicode_ranges); + + for (;;) + { + unsigned int i = (i1 + i2) >> 1; + uint16_t start_index = unicode_ranges[i].index; + uint16_t end_index = start_index + unicode_ranges[i].length - 1; + + if (start_index <= index && index <= end_index) + return index + unicode_ranges[i].gap; + + if (end_index < index) + { + if (i1 == i) + break; + /* Note here: i1 < i < i2. */ + i1 = i; + } + else if (index < start_index) + { + if (i2 == i) + break; + /* Note here: i1 <= i < i2. */ + i2 = i; + } + } + return UNINAME_INVALID; +} + + /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book, sections 3.11 and 4.4. */ static const char jamo_initial_short_name[19][3] = @@ -203,78 +280,47 @@ unicode_character_name (ucs4_t c, char *buf) } else { - const uint16_t *words; + uint16_t index = unicode_code_to_index (c); + const uint16_t *words = NULL; - /* Transform the code so that it fits in 16 bits. */ - switch (c >> 12) + if (index != UNINAME_INVALID_INDEX) { - case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: - break; - case 0x0A: - c -= 0x05000; - break; - case 0x0F: - c -= 0x09000; - break; - case 0x10: - c -= 0x09000; - break; - case 0x12: - c -= 0x0A000; - break; - case 0x1D: - c -= 0x14000; - break; - case 0x1F: - c -= 0x15000; - break; - case 0x2F: - c -= 0x24000; - break; - case 0xE0: - c -= 0xD4000; - break; - default: - return NULL; + /* Binary search in unicode_code_to_name. */ + unsigned int i1 = 0; + unsigned int i2 = SIZEOF (unicode_index_to_name); + for (;;) + { + unsigned int i = (i1 + i2) >> 1; + if (unicode_index_to_name[i].index == index) + { + words = &unicode_names[unicode_index_to_name[i].name]; + break; + } + else if (unicode_index_to_name[i].index < index) + { + if (i1 == i) + { + words = NULL; + break; + } + /* Note here: i1 < i < i2. */ + i1 = i; + } + else if (unicode_index_to_name[i].index > index) + { + if (i2 == i) + { + words = NULL; + break; + } + /* Note here: i1 <= i < i2. */ + i2 = i; + } + } } - - { - /* Binary search in unicode_code_to_name. */ - unsigned int i1 = 0; - unsigned int i2 = SIZEOF (unicode_code_to_name); - for (;;) - { - unsigned int i = (i1 + i2) >> 1; - if (unicode_code_to_name[i].code == c) - { - words = &unicode_names[unicode_code_to_name[i].name]; - break; - } - else if (unicode_code_to_name[i].code < c) - { - if (i1 == i) - { - words = NULL; - break; - } - /* Note here: i1 < i < i2. */ - i1 = i; - } - else if (unicode_code_to_name[i].code > c) - { - if (i2 == i) - { - words = NULL; - break; - } - /* Note here: i1 <= i < i2. */ - i2 = i; - } - } - } if (words != NULL) { - /* Found it in unicode_code_to_name. Now concatenate the words. */ + /* Found it in unicode_index_to_name. Now concatenate the words. */ /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */ char *ptr = buf; for (;;) @@ -463,15 +509,15 @@ unicode_name_character (const char *name) for (; --i >= 0; ) words[i] = 2 * words[i] + 1; } - /* Binary search in unicode_name_to_code. */ + /* Binary search in unicode_name_to_index. */ { unsigned int i1 = 0; - unsigned int i2 = SIZEOF (unicode_name_to_code); + unsigned int i2 = SIZEOF (unicode_name_to_index); for (;;) { unsigned int i = (i1 + i2) >> 1; const uint16_t *w = words; - const uint16_t *p = &unicode_names[unicode_name_to_code[i].name]; + const uint16_t *p = &unicode_names[unicode_name_to_index[i].name]; unsigned int n = words_length; for (;;) { @@ -493,18 +539,7 @@ unicode_name_character (const char *name) } p++; w++; n--; if (n == 0) - { - unsigned int c = unicode_name_to_code[i].code; - - /* Undo the transformation to 16-bit space. */ - static const unsigned int offset[13] = - { - 0x00000, 0x00000, 0x00000, 0x00000, 0x00000, - 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000, - 0x15000, 0x24000, 0xD4000 - }; - return c + offset[c >> 12]; - } + return unicode_index_to_code (unicode_name_to_index[i].index); } } } -- 1.9.3