* lib/gen-uni-tables.c (output_joining_group): Switch to a 3-level
table to accommodate joining groups assigned to characters at higher
code points.  Since there are only 88 groups defined in Unicode 7.0.0,
use a 7-bit packed format for level3 entries.
* lib/unictype/joininggroup_of.c (uc_joining_group): Adjust to use
the 3-level table.
* lib/unictype/joininggroup_of.h: Regenerate.
---
 lib/gen-uni-tables.c           | 155 ++++++++++++++++++++++++++++++-----------
 lib/unictype/joininggroup_of.c |  29 ++++++--
 2 files changed, 139 insertions(+), 45 deletions(-)
diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index 7a273fc..1af832e 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -3987,7 +3987,7 @@ output_joining_type (const char *filename, const char *version) } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); - fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Arabic joining group of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); @@ -4213,11 +4213,22 @@ output_joining_group_test (const char *filename, const char *version) } } +/* Construction of sparse 3-level tables. */ +#define TABLE joining_group_table +#define ELEMENT uint8_t +#define DEFAULT UC_JOINING_GROUP_NONE +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + static void output_joining_group (const char *filename, const char *version) { FILE *stream; - unsigned int ch_min, ch_max, ch, i; + unsigned int ch, i; + struct joining_group_table t; + unsigned int level1_offset, level2_offset, level3_offset; + uint16_t *level3_packed; stream = fopen (filename, "w"); if (stream == NULL) @@ -4231,51 +4242,115 @@ output_joining_group (const char *filename, const char *version) fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); - ch_min = 0x10FFFF; + t.p = 7; + t.q = 9; + joining_group_table_init (&t); + for (ch = 0; ch < 0x110000; ch++) - if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE) - { - ch_min = ch; - break; - } + { + uint8_t value = unicode_joining_group[ch]; - ch_max = 0; - for (ch = 0x10FFFF; ch > 0; ch--) - if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE) - { - ch_max = ch; - break; - } + if (value > 0x7f) + abort (); - if (!(ch_min <= ch_max)) - abort (); + joining_group_table_add (&t, ch, value); + } - /* If the interval [ch_min, ch_max] is too large, we should better use a - 3-level table. 
*/ - if (!(ch_max - ch_min < 0x200)) - abort (); + joining_group_table_finalize (&t); - fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min); - fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n", - ch_max + 1, ch_min); - fprintf (stream, "{"); - for (i = 0; i <= ch_max - ch_min; i++) - { - const char *s; + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); - ch = ch_min + i; - if ((i % 2) == 0) - fprintf (stream, "\n "); - s = joining_group_as_c_identifier (unicode_joining_group[ch]); - fprintf (stream, " %s", s); - if (i+1 <= ch_max - ch_min) - { - fprintf (stream, ","); - if (((i+1) % 2) != 0) - fprintf (stream, "%*s", 38 - (int) strlen (s), ""); - } + for (i = 0; i < 5; i++) + fprintf (stream, "#define joining_group_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size, + (1 << t.p) * 7 / 16); + fprintf (stream, " }\n"); + fprintf (stream, "u_joining_group =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); } - fprintf (stream, "\n"); + if (t.level1_size > 8) + 
fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units, + not 32-bit units, in order to make the lookup function easier. */ + level3_packed = + (uint16_t *) + calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t)); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned int j = (i * 7) / 16; + unsigned int k = (i * 7) % 16; + uint32_t value = ((unsigned char *) (t.result + level3_offset))[i]; + value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k); + level3_packed[j] = value & 0xffff; + level3_packed[j+1] = value >> 16; + } + fprintf (stream, " {"); + if ((t.level3_size << t.p) * 7 / 16 + 1 > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%04x", level3_packed[i]); + if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1) + fprintf (stream, ","); + } + if ((t.level3_size << t.p) * 7 / 16 + 1 > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + free (level3_packed); fprintf (stream, "};\n"); if (ferror (stream) || fclose (stream)) diff --git a/lib/unictype/joininggroup_of.c b/lib/unictype/joininggroup_of.c index 987af1e..c7b6846 100644 --- a/lib/unictype/joininggroup_of.c +++ b/lib/unictype/joininggroup_of.c @@ -20,14 +20,33 @@ /* Specification. 
*/
 #include "unictype.h"
 
-#include "unictype/joininggroup_of.h"
+/* Define u_joining_group table.  */
+#include "joininggroup_of.h"
 
 int
 uc_joining_group (ucs4_t uc)
 {
-  if (uc >= joining_group_header_0
-      && uc < joining_group_header_0
-              + sizeof (u_joining_group) / sizeof (u_joining_group[0]))
-    return u_joining_group[uc - joining_group_header_0];
+  unsigned int index1 = uc >> joining_group_header_0;
+  if (index1 < joining_group_header_1)
+    {
+      int lookup1 = u_joining_group.level1[index1];
+      if (lookup1 >= 0)
+        {
+          unsigned int index2 = (uc >> joining_group_header_2) & joining_group_header_3;
+          int lookup2 = u_joining_group.level2[lookup1 + index2];
+          if (lookup2 >= 0)
+            {
+              unsigned int index3 = ((uc & joining_group_header_4) + lookup2) * 7;
+              /* level3 contains 7-bit values, packed into 16-bit words.
+                 A value may straddle two words, so read both.  Widen the
+                 high word to unsigned int before shifting: a promoted int
+                 could overflow on << 16.  */
+              unsigned int lo = u_joining_group.level3[index3 >> 4];
+              unsigned int hi = u_joining_group.level3[(index3 >> 4) + 1];
+
+              return ((lo | (hi << 16)) >> (index3 % 16)) & 0x7f;
+            }
+        }
+    }
   return UC_JOINING_GROUP_NONE;
 }
-- 
1.9.3