Changeset: cbbebe3d2875 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=cbbebe3d2875
Modified Files:
        monetdb5/modules/atoms/str.c
Branch: Jul2017
Log Message:

Updated case conversion tables to Unicode 10.0; various other code improvements.


diffs (truncated from 3559 to 300 lines):

diff --git a/monetdb5/modules/atoms/str.c b/monetdb5/modules/atoms/str.c
--- a/monetdb5/modules/atoms/str.c
+++ b/monetdb5/modules/atoms/str.c
@@ -96,51 +96,10 @@
  * high-performance hash-lookup (all code inlined).
  */
 
-/* This table was generated from the Unicode 5.0.0 spec. The table is
- * generated by using the codes for conversion to lower case and for
- * conversion to title case and upper case. A few code points have
- * been moved in order to get reasonable conversions (if two code
- * points are converted to the same value, the first one in this table
- * wins).  The code points that have been interchanged are:
- * U+0345 (COMBINING GREEK YPOGEGRAMMENI) / U+03B9 (GREEK SMALL LETTER IOTA) <-> U+0399 (GREEK CAPITAL LETTER IOTA)
- * U+00B5 (MICRO SIGN) / U+03BC (GREEK SMALL LETTER MU) <-> U+039C (GREEK CAPITAL LETTER MU)
- * U+03C2 (GREEK SMALL LETTER FINAL SIGMA) / U+03C3 (GREEK SMALL LETTER SIGMA) <-> U+3A3 (GREEK CAPITAL LETTER SIGMA)
- *
- * In addition, there are a few code points where there are different
- * versions for upper case and title case.  These had to be switched
- * around a little so that the mappings are done sensibly.
- *
- * The following combinations are included in this order:
- * lower case <-> title case
- * lower case <-  upper case
- * upper case  -> title case
- * The conversion title case -> upper case was removed
- *
- * The relevant code points are:
- * U+01C4 (LATIN CAPITAL LETTER DZ WITH CARON)
- * U+01C5 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON)
- * U+01C6 (LATIN SMALL LETTER DZ WITH CARON)
- * U+01C7 (LATIN CAPITAL LETTER LJ)
- * U+01C8 (LATIN CAPITAL LETTER L WITH SMALL LETTER J)
- * U+01C9 (LATIN SMALL LETTER LJ)
- * U+01CA (LATIN CAPITAL LETTER NJ)
- * U+01CB (LATIN CAPITAL LETTER N WITH SMALL LETTER J)
- * U+01CC (LATIN SMALL LETTER NJ)
- * U+01F1 (LATIN CAPITAL LETTER DZ)
- * U+01F2 (LATIN CAPITAL LETTER D WITH SMALL LETTER Z)
- * U+01F3 (LATIN SMALL LETTER DZ)
- *
- * The script used was basically:
-(cut -d\; -f1,14 UnicodeData.txt | sed -n 's/\(.*\);\(..*\)/\2;\1/p'
- cut -d\; -f1,15 UnicodeData.txt | grep -v ';$'
- cut -d\; -f1,13 UnicodeData.txt | grep -v ';$'
-) | grep -v '^\([^ ]*\);\1$' | sort -t\; -u | sed 's/\(.*\);\(.*\)/{0x\1,0x\2,},/'
- * with some hand munging afterward.  The data file is UnicodeData.txt
- * from http://www.unicode.org/.
- */
+/* These tables were generated from the Unicode 10.0.0 spec. */
 struct UTF8_lower_upper {
-       unsigned int lower, upper;
-} UTF8_lower_upper[] = {
+       unsigned int from, to;
+} UTF8_toUpper[] = { /* code points with non-null uppercase conversion */
        { 0x0061, 0x0041, },
        { 0x0062, 0x0042, },
        { 0x0063, 0x0043, },
@@ -150,10 +109,8 @@ struct UTF8_lower_upper {
        { 0x0067, 0x0047, },
        { 0x0068, 0x0048, },
        { 0x0069, 0x0049, },
-       { 0x0069, 0x0130, },
        { 0x006A, 0x004A, },
        { 0x006B, 0x004B, },
-       { 0x006B, 0x212A, },
        { 0x006C, 0x004C, },
        { 0x006D, 0x004D, },
        { 0x006E, 0x004E, },
@@ -169,14 +126,13 @@ struct UTF8_lower_upper {
        { 0x0078, 0x0058, },
        { 0x0079, 0x0059, },
        { 0x007A, 0x005A, },
-       { 0x03BC, 0x039C, },
+       { 0x00B5, 0x039C, },
        { 0x00E0, 0x00C0, },
        { 0x00E1, 0x00C1, },
        { 0x00E2, 0x00C2, },
        { 0x00E3, 0x00C3, },
        { 0x00E4, 0x00C4, },
        { 0x00E5, 0x00C5, },
-       { 0x00E5, 0x212B, },
        { 0x00E6, 0x00C6, },
        { 0x00E7, 0x00C7, },
        { 0x00E8, 0x00C8, },
@@ -286,15 +242,12 @@ struct UTF8_lower_upper {
        { 0x01B9, 0x01B8, },
        { 0x01BD, 0x01BC, },
        { 0x01BF, 0x01F7, },
-       { 0x01C6, 0x01C5, },
+       { 0x01C5, 0x01C4, },
        { 0x01C6, 0x01C4, },
-       { 0x01C4, 0x01C5, },
-       { 0x01C9, 0x01C8, },
+       { 0x01C8, 0x01C7, },
        { 0x01C9, 0x01C7, },
-       { 0x01C7, 0x01C8, },
-       { 0x01CC, 0x01CB, },
+       { 0x01CB, 0x01CA, },
        { 0x01CC, 0x01CA, },
-       { 0x01CA, 0x01CB, },
        { 0x01CE, 0x01CD, },
        { 0x01D0, 0x01CF, },
        { 0x01D2, 0x01D1, },
@@ -313,9 +266,8 @@ struct UTF8_lower_upper {
        { 0x01EB, 0x01EA, },
        { 0x01ED, 0x01EC, },
        { 0x01EF, 0x01EE, },
-       { 0x01F3, 0x01F2, },
+       { 0x01F2, 0x01F1, },
        { 0x01F3, 0x01F1, },
-       { 0x01F1, 0x01F2, },
        { 0x01F5, 0x01F4, },
        { 0x01F9, 0x01F8, },
        { 0x01FB, 0x01FA, },
@@ -347,36 +299,54 @@ struct UTF8_lower_upper {
        { 0x0231, 0x0230, },
        { 0x0233, 0x0232, },
        { 0x023C, 0x023B, },
+       { 0x023F, 0x2C7E, },
+       { 0x0240, 0x2C7F, },
        { 0x0242, 0x0241, },
        { 0x0247, 0x0246, },
        { 0x0249, 0x0248, },
        { 0x024B, 0x024A, },
        { 0x024D, 0x024C, },
        { 0x024F, 0x024E, },
+       { 0x0250, 0x2C6F, },
+       { 0x0251, 0x2C6D, },
+       { 0x0252, 0x2C70, },
        { 0x0253, 0x0181, },
        { 0x0254, 0x0186, },
        { 0x0256, 0x0189, },
        { 0x0257, 0x018A, },
        { 0x0259, 0x018F, },
        { 0x025B, 0x0190, },
+       { 0x025C, 0xA7AB, },
        { 0x0260, 0x0193, },
+       { 0x0261, 0xA7AC, },
        { 0x0263, 0x0194, },
+       { 0x0265, 0xA78D, },
+       { 0x0266, 0xA7AA, },
        { 0x0268, 0x0197, },
        { 0x0269, 0x0196, },
+       { 0x026A, 0xA7AE, },
        { 0x026B, 0x2C62, },
+       { 0x026C, 0xA7AD, },
        { 0x026F, 0x019C, },
+       { 0x0271, 0x2C6E, },
        { 0x0272, 0x019D, },
        { 0x0275, 0x019F, },
        { 0x027D, 0x2C64, },
        { 0x0280, 0x01A6, },
        { 0x0283, 0x01A9, },
+       { 0x0287, 0xA7B1, },
        { 0x0288, 0x01AE, },
        { 0x0289, 0x0244, },
        { 0x028A, 0x01B1, },
        { 0x028B, 0x01B2, },
        { 0x028C, 0x0245, },
        { 0x0292, 0x01B7, },
-       { 0x03B9, 0x0399, },
+       { 0x029D, 0xA7B2, },
+       { 0x029E, 0xA7B0, },
+       { 0x0345, 0x0399, },
+       { 0x0371, 0x0370, },
+       { 0x0373, 0x0372, },
+       { 0x0377, 0x0376, },
        { 0x037B, 0x03FD, },
        { 0x037C, 0x03FE, },
        { 0x037D, 0x03FF, },
@@ -392,25 +362,23 @@ struct UTF8_lower_upper {
        { 0x03B6, 0x0396, },
        { 0x03B7, 0x0397, },
        { 0x03B8, 0x0398, },
-       { 0x03B8, 0x03F4, },
-       { 0x0345, 0x0399, },
+       { 0x03B9, 0x0399, },
        { 0x03BA, 0x039A, },
        { 0x03BB, 0x039B, },
-       { 0x00B5, 0x039C, },
+       { 0x03BC, 0x039C, },
        { 0x03BD, 0x039D, },
        { 0x03BE, 0x039E, },
        { 0x03BF, 0x039F, },
        { 0x03C0, 0x03A0, },
        { 0x03C1, 0x03A1, },
+       { 0x03C2, 0x03A3, },
        { 0x03C3, 0x03A3, },
-       { 0x03C2, 0x03A3, },
        { 0x03C4, 0x03A4, },
        { 0x03C5, 0x03A5, },
        { 0x03C6, 0x03A6, },
        { 0x03C7, 0x03A7, },
        { 0x03C8, 0x03A8, },
        { 0x03C9, 0x03A9, },
-       { 0x03C9, 0x2126, },
        { 0x03CA, 0x03AA, },
        { 0x03CB, 0x03AB, },
        { 0x03CC, 0x038C, },
@@ -420,6 +388,7 @@ struct UTF8_lower_upper {
        { 0x03D1, 0x0398, },
        { 0x03D5, 0x03A6, },
        { 0x03D6, 0x03A0, },
+       { 0x03D7, 0x03CF, },
        { 0x03D9, 0x03D8, },
        { 0x03DB, 0x03DA, },
        { 0x03DD, 0x03DC, },
@@ -435,6 +404,7 @@ struct UTF8_lower_upper {
        { 0x03F0, 0x039A, },
        { 0x03F1, 0x03A1, },
        { 0x03F2, 0x03F9, },
+       { 0x03F3, 0x037F, },
        { 0x03F5, 0x0395, },
        { 0x03F8, 0x03F7, },
        { 0x03FB, 0x03FA, },
@@ -572,6 +542,20 @@ struct UTF8_lower_upper {
        { 0x050F, 0x050E, },
        { 0x0511, 0x0510, },
        { 0x0513, 0x0512, },
+       { 0x0515, 0x0514, },
+       { 0x0517, 0x0516, },
+       { 0x0519, 0x0518, },
+       { 0x051B, 0x051A, },
+       { 0x051D, 0x051C, },
+       { 0x051F, 0x051E, },
+       { 0x0521, 0x0520, },
+       { 0x0523, 0x0522, },
+       { 0x0525, 0x0524, },
+       { 0x0527, 0x0526, },
+       { 0x0529, 0x0528, },
+       { 0x052B, 0x052A, },
+       { 0x052D, 0x052C, },
+       { 0x052F, 0x052E, },
        { 0x0561, 0x0531, },
        { 0x0562, 0x0532, },
        { 0x0563, 0x0533, },
@@ -610,6 +594,22 @@ struct UTF8_lower_upper {
        { 0x0584, 0x0554, },
        { 0x0585, 0x0555, },
        { 0x0586, 0x0556, },
+       { 0x13F8, 0x13F0, },
+       { 0x13F9, 0x13F1, },
+       { 0x13FA, 0x13F2, },
+       { 0x13FB, 0x13F3, },
+       { 0x13FC, 0x13F4, },
+       { 0x13FD, 0x13F5, },
+       { 0x1C80, 0x0412, },
+       { 0x1C81, 0x0414, },
+       { 0x1C82, 0x041E, },
+       { 0x1C83, 0x0421, },
+       { 0x1C84, 0x0422, },
+       { 0x1C85, 0x0422, },
+       { 0x1C86, 0x042A, },
+       { 0x1C87, 0x0462, },
+       { 0x1C88, 0xA64A, },
+       { 0x1D79, 0xA77D, },
        { 0x1D7D, 0x2C63, },
        { 0x1E01, 0x1E00, },
        { 0x1E03, 0x1E02, },
@@ -732,6 +732,9 @@ struct UTF8_lower_upper {
        { 0x1EF5, 0x1EF4, },
        { 0x1EF7, 0x1EF6, },
        { 0x1EF9, 0x1EF8, },
+       { 0x1EFB, 0x1EFA, },
+       { 0x1EFD, 0x1EFC, },
+       { 0x1EFF, 0x1EFE, },
        { 0x1F00, 0x1F08, },
        { 0x1F01, 0x1F09, },
        { 0x1F02, 0x1F0A, },
@@ -926,6 +929,7 @@ struct UTF8_lower_upper {
        { 0x2C68, 0x2C67, },
        { 0x2C6A, 0x2C69, },
        { 0x2C6C, 0x2C6B, },
+       { 0x2C73, 0x2C72, },
        { 0x2C76, 0x2C75, },
        { 0x2C81, 0x2C80, },
        { 0x2C83, 0x2C82, },
@@ -977,6 +981,9 @@ struct UTF8_lower_upper {
        { 0x2CDF, 0x2CDE, },
        { 0x2CE1, 0x2CE0, },
        { 0x2CE3, 0x2CE2, },
+       { 0x2CEC, 0x2CEB, },
+       { 0x2CEE, 0x2CED, },
+       { 0x2CF3, 0x2CF2, },
        { 0x2D00, 0x10A0, },
        { 0x2D01, 0x10A1, },
        { 0x2D02, 0x10A2, },
@@ -1015,6 +1022,186 @@ struct UTF8_lower_upper {
        { 0x2D23, 0x10C3, },
        { 0x2D24, 0x10C4, },
        { 0x2D25, 0x10C5, },
+       { 0x2D27, 0x10C7, },
+       { 0x2D2D, 0x10CD, },
+       { 0xA641, 0xA640, },
+       { 0xA643, 0xA642, },
+       { 0xA645, 0xA644, },
+       { 0xA647, 0xA646, },
+       { 0xA649, 0xA648, },
+       { 0xA64B, 0xA64A, },
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to