Changeset: 6616adb34787 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/6616adb34787
Modified Files:
	clients/Tests/MAL-signatures-hge.test
	clients/Tests/MAL-signatures.test
	clients/Tests/exports.stable.out
	gdk/gdk.h
	gdk/gdk_string.c
	monetdb5/modules/atoms/str.c
	monetdb5/modules/kernel/batstr.c
	monetdb5/modules/mal/pcre.c
Branch: ascii-flag
Log Message:
Implemented case folding, and use it for ILIKE. diffs (truncated from 2394 to 300 lines): diff --git a/clients/Tests/MAL-signatures-hge.test b/clients/Tests/MAL-signatures-hge.test --- a/clients/Tests/MAL-signatures-hge.test +++ b/clients/Tests/MAL-signatures-hge.test @@ -34709,6 +34709,16 @@ pattern batstr.asciify(X_0:bat[:str], X_ BATSTRasciify; Transform BAT of strings from UTF8 to ASCII batstr +caseFold +pattern batstr.caseFold(X_0:bat[:str]):bat[:str] +STRbatCaseFold; +Fold the case of a string. +batstr +caseFold +pattern batstr.caseFold(X_0:bat[:str], X_1:bat[:oid]):bat[:str] +STRbatCaseFold; +Fold the case of a string. +batstr contains pattern batstr.contains(X_0:bat[:str], X_1:bat[:str]):bat[:bit] BATSTRcontains; @@ -50684,6 +50694,11 @@ command str.asciify(X_0:str):str STRasciify; Transform string from UTF8 to ASCII str +caseFold +command str.caseFold(X_0:str):str +STRcasefold; +Fold the case of a string. +str contains pattern str.contains(X_0:str, X_1:str):bit STRcontains; diff --git a/clients/Tests/MAL-signatures.test b/clients/Tests/MAL-signatures.test --- a/clients/Tests/MAL-signatures.test +++ b/clients/Tests/MAL-signatures.test @@ -25759,6 +25759,16 @@ pattern batstr.asciify(X_0:bat[:str], X_ BATSTRasciify; Transform BAT of strings from UTF8 to ASCII batstr +caseFold +pattern batstr.caseFold(X_0:bat[:str]):bat[:str] +STRbatCaseFold; +Fold the case of a string. +batstr +caseFold +pattern batstr.caseFold(X_0:bat[:str], X_1:bat[:oid]):bat[:str] +STRbatCaseFold; +Fold the case of a string. +batstr contains pattern batstr.contains(X_0:bat[:str], X_1:bat[:str]):bat[:bit] BATSTRcontains; @@ -39009,6 +39019,11 @@ command str.asciify(X_0:str):str STRasciify; Transform string from UTF8 to ASCII str +caseFold +command str.caseFold(X_0:str):str +STRcasefold; +Fold the case of a string. 
+str contains pattern str.contains(X_0:str, X_1:str):bit STRcontains; diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -109,6 +109,7 @@ dbl BATcalcvariance_population(dbl *avgp dbl BATcalcvariance_sample(dbl *avgp, BAT *b); BAT *BATcalcxor(BAT *b1, BAT *b2, BAT *s1, BAT *s2); BAT *BATcalcxorcst(BAT *b, const ValRecord *v, BAT *s); +BAT *BATcasefold(BAT *b, BAT *s); bool BATcheckorderidx(BAT *b); gdk_return BATclear(BAT *b, bool force); void BATcommit(BAT *b, BUN size); @@ -271,6 +272,7 @@ gdk_return GDKanalyticalsum(BAT *r, BAT gdk_return GDKanalyticalwindowbounds(BAT *r, BAT *b, BAT *p, BAT *l, const void *restrict bound, int tp1, int tp2, int unit, bool preceding, oid first_half); gdk_return GDKasciify(char **restrict buf, size_t *restrict buflen, const char *restrict s); int GDKatomcnt; +gdk_return GDKcasefold(char **restrict buf, size_t *restrict buflen, const char *restrict s); void GDKclrerr(void); gdk_return GDKcopyenv(BAT **key, BAT **val, bool writable); gdk_return GDKcreatedir(const char *nme); diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -2343,11 +2343,13 @@ gdk_export gdk_return BATfirstn(BAT **to gdk_export gdk_return GDKtoupper(char **restrict buf, size_t *restrict buflen, const char *restrict s); gdk_export gdk_return GDKtolower(char **restrict buf, size_t *restrict buflen, const char *restrict s); +gdk_export gdk_return GDKcasefold(char **restrict buf, size_t *restrict buflen, const char *restrict s); gdk_export int GDKstrncasecmp(const char *str1, const char *str2, size_t l1, size_t l2); gdk_export int GDKstrcasecmp(const char *s1, const char *s2); gdk_export char *GDKstrcasestr(const char *haystack, const char *needle); +gdk_export BAT *BATtoupper(BAT *b, BAT *s); gdk_export BAT *BATtolower(BAT *b, BAT *s); -gdk_export BAT *BATtoupper(BAT *b, BAT *s); +gdk_export BAT *BATcasefold(BAT *b, BAT *s); gdk_export 
gdk_return GDKasciify(char **restrict buf, size_t *restrict buflen, const char *restrict s); gdk_export BAT *BATasciify(BAT *b, BAT *s); diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c --- a/gdk/gdk_string.c +++ b/gdk/gdk_string.c @@ -1473,8 +1473,80 @@ GDKanalytical_str_group_concat(BAT *r, B * a (new) offset into the same table. */ static const char *const specialcase[] = { NULL, + "ss", + "i\xCC\x87", + "\xCA\xBCn", + "j\xCC\x8C", + "\xCE\xB9\xCC\x88\xCC\x81", + "\xCF\x85\xCC\x88\xCC\x81", + "\xD5\xA5\xD6\x82", + "h\xCC\xB1", + "t\xCC\x88", + "w\xCC\x8A", + "y\xCC\x8A", + "a\xCA\xBE", + "\xCF\x85\xCC\x93", + "\xCF\x85\xCC\x93\xCC\x80", + "\xCF\x85\xCC\x93\xCC\x81", + "\xCF\x85\xCC\x93\xCD\x82", + "\xE1\xBC\x80\xCE\xB9", + "\xE1\xBC\x81\xCE\xB9", + "\xE1\xBC\x82\xCE\xB9", + "\xE1\xBC\x83\xCE\xB9", + "\xE1\xBC\x84\xCE\xB9", + "\xE1\xBC\x85\xCE\xB9", + "\xE1\xBC\x86\xCE\xB9", + "\xE1\xBC\x87\xCE\xB9", + "\xE1\xBC\xA0\xCE\xB9", + "\xE1\xBC\xA1\xCE\xB9", + "\xE1\xBC\xA2\xCE\xB9", + "\xE1\xBC\xA3\xCE\xB9", + "\xE1\xBC\xA4\xCE\xB9", + "\xE1\xBC\xA5\xCE\xB9", + "\xE1\xBC\xA6\xCE\xB9", + "\xE1\xBC\xA7\xCE\xB9", + "\xE1\xBD\xA0\xCE\xB9", + "\xE1\xBD\xA1\xCE\xB9", + "\xE1\xBD\xA2\xCE\xB9", + "\xE1\xBD\xA3\xCE\xB9", + "\xE1\xBD\xA4\xCE\xB9", + "\xE1\xBD\xA5\xCE\xB9", + "\xE1\xBD\xA6\xCE\xB9", + "\xE1\xBD\xA7\xCE\xB9", + "\xE1\xBD\xB0\xCE\xB9", + "\xCE\xB1\xCE\xB9", + "\xCE\xAC\xCE\xB9", + "\xCE\xB1\xCD\x82", + "\xCE\xB1\xCD\x82\xCE\xB9", + "\xE1\xBD\xB4\xCE\xB9", + "\xCE\xB7\xCE\xB9", + "\xCE\xAE\xCE\xB9", + "\xCE\xB7\xCD\x82", + "\xCE\xB7\xCD\x82\xCE\xB9", + "\xCE\xB9\xCC\x88\xCC\x80", + "\xCE\xB9\xCD\x82", + "\xCE\xB9\xCC\x88\xCD\x82", + "\xCF\x85\xCC\x88\xCC\x80", + "\xCF\x81\xCC\x93", + "\xCF\x85\xCD\x82", + "\xCF\x85\xCC\x88\xCD\x82", + "\xE1\xBD\xBC\xCE\xB9", + "\xCF\x89\xCE\xB9", + "\xCF\x8E\xCE\xB9", + "\xCF\x89\xCD\x82", + "\xCF\x89\xCD\x82\xCE\xB9", + "ff", + "fi", + "fl", + "ffi", + "ffl", + "st", + "\xD5\xB4\xD5\xB6", + "\xD5\xB4\xD5\xA5", + "\xD5\xB4\xD5\xAB", 
+ "\xD5\xBE\xD5\xB6", + "\xD5\xB4\xD5\xAD", "SS", - "i\xCC\x87", "FF", "FI", "FL", @@ -1548,6 +1620,38 @@ static const char *const specialcase[] = "\xCE\xA9\xCD\x82\xCE\x99", }; static const int lowercase[4288] = { + [0x00] = 0x0000, /* U+0000: <control> */ + [0x01] = 0x0001, /* U+0001: <control> */ + [0x02] = 0x0002, /* U+0002: <control> */ + [0x03] = 0x0003, /* U+0003: <control> */ + [0x04] = 0x0004, /* U+0004: <control> */ + [0x05] = 0x0005, /* U+0005: <control> */ + [0x06] = 0x0006, /* U+0006: <control> */ + [0x07] = 0x0007, /* U+0007: <control> */ + [0x08] = 0x0008, /* U+0008: <control> */ + [0x09] = 0x0009, /* U+0009: <control> */ + [0x0A] = 0x000A, /* U+000A: <control> */ + [0x0B] = 0x000B, /* U+000B: <control> */ + [0x0C] = 0x000C, /* U+000C: <control> */ + [0x0D] = 0x000D, /* U+000D: <control> */ + [0x0E] = 0x000E, /* U+000E: <control> */ + [0x0F] = 0x000F, /* U+000F: <control> */ + [0x10] = 0x0010, /* U+0010: <control> */ + [0x11] = 0x0011, /* U+0011: <control> */ + [0x12] = 0x0012, /* U+0012: <control> */ + [0x13] = 0x0013, /* U+0013: <control> */ + [0x14] = 0x0014, /* U+0014: <control> */ + [0x15] = 0x0015, /* U+0015: <control> */ + [0x16] = 0x0016, /* U+0016: <control> */ + [0x17] = 0x0017, /* U+0017: <control> */ + [0x18] = 0x0018, /* U+0018: <control> */ + [0x19] = 0x0019, /* U+0019: <control> */ + [0x1A] = 0x001A, /* U+001A: <control> */ + [0x1B] = 0x001B, /* U+001B: <control> */ + [0x1C] = 0x001C, /* U+001C: <control> */ + [0x1D] = 0x001D, /* U+001D: <control> */ + [0x1E] = 0x001E, /* U+001E: <control> */ + [0x1F] = 0x001F, /* U+001F: <control> */ [0x20] = 0x0020, /* U+0020: SPACE */ [0x21] = 0x0021, /* U+0021: EXCLAMATION MARK */ [0x22] = 0x0022, /* U+0022: QUOTATION MARK */ @@ -1643,6 +1747,7 @@ static const int lowercase[4288] = { [0x7C] = 0x007C, /* U+007C: VERTICAL LINE */ [0x7D] = 0x007D, /* U+007D: RIGHT CURLY BRACKET */ [0x7E] = 0x007E, /* U+007E: TILDE */ + [0x7F] = 0x007F, /* U+007F: <control> */ [0xC3] = 256 - 0x80, /* 303 ... 
*/ [256+0x00] = 0x00E0, /* U+00C0: LATIN CAPITAL LETTER A WITH GRAVE */ [256+0x01] = 0x00E1, /* U+00C1: LATIN CAPITAL LETTER A WITH ACUTE */ @@ -3115,6 +3220,38 @@ static const int lowercase[4288] = { [4224+0x21] = 0x1E943, /* U+1E921: ADLAM CAPITAL LETTER SHA */ }; static const int uppercase[4608] = { + [0x00] = 0x0000, /* U+0000: <control> */ + [0x01] = 0x0001, /* U+0001: <control> */ + [0x02] = 0x0002, /* U+0002: <control> */ + [0x03] = 0x0003, /* U+0003: <control> */ + [0x04] = 0x0004, /* U+0004: <control> */ + [0x05] = 0x0005, /* U+0005: <control> */ + [0x06] = 0x0006, /* U+0006: <control> */ + [0x07] = 0x0007, /* U+0007: <control> */ + [0x08] = 0x0008, /* U+0008: <control> */ + [0x09] = 0x0009, /* U+0009: <control> */ + [0x0A] = 0x000A, /* U+000A: <control> */ + [0x0B] = 0x000B, /* U+000B: <control> */ + [0x0C] = 0x000C, /* U+000C: <control> */ + [0x0D] = 0x000D, /* U+000D: <control> */ + [0x0E] = 0x000E, /* U+000E: <control> */ + [0x0F] = 0x000F, /* U+000F: <control> */ + [0x10] = 0x0010, /* U+0010: <control> */ + [0x11] = 0x0011, /* U+0011: <control> */ + [0x12] = 0x0012, /* U+0012: <control> */ + [0x13] = 0x0013, /* U+0013: <control> */ + [0x14] = 0x0014, /* U+0014: <control> */ + [0x15] = 0x0015, /* U+0015: <control> */ + [0x16] = 0x0016, /* U+0016: <control> */ + [0x17] = 0x0017, /* U+0017: <control> */ + [0x18] = 0x0018, /* U+0018: <control> */ + [0x19] = 0x0019, /* U+0019: <control> */ + [0x1A] = 0x001A, /* U+001A: <control> */ + [0x1B] = 0x001B, /* U+001B: <control> */ + [0x1C] = 0x001C, /* U+001C: <control> */ + [0x1D] = 0x001D, /* U+001D: <control> */ + [0x1E] = 0x001E, /* U+001E: <control> */ + [0x1F] = 0x001F, /* U+001F: <control> */ [0x20] = 0x0020, /* U+0020: SPACE */ [0x21] = 0x0021, /* U+0021: EXCLAMATION MARK */ [0x22] = 0x0022, /* U+0022: QUOTATION MARK */ @@ -3210,10 +3347,11 @@ static const int uppercase[4608] = { [0x7C] = 0x007C, /* U+007C: VERTICAL LINE */ [0x7D] = 0x007D, /* U+007D: RIGHT CURLY BRACKET */ [0x7E] = 0x007E, /* U+007E: 
TILDE */ + [0x7F] = 0x007F, /* U+007F: <control> */ [0xC2] = 256 - 0x80, /* 302 ... */ [256+0x35] = 0x039C, /* U+00B5: MICRO SIGN */ [0xC3] = 320 - 0x80, /* 303 ... */ - [320+0x1F] = -1, /* U+00DF: LATIN SMALL LETTER SHARP S */ + [320+0x1F] = -74, /* U+00DF: LATIN SMALL LETTER SHARP S */ [320+0x20] = 0x00C0, /* U+00E0: LATIN SMALL LETTER A WITH GRAVE */ [320+0x21] = 0x00C1, /* U+00E1: LATIN SMALL LETTER A WITH ACUTE */ [320+0x22] = 0x00C2, /* U+00E2: LATIN SMALL LETTER A WITH CIRCUMFLEX */ @@ -3283,7 +3421,7 @@ static const int uppercase[4608] = { [448+0x04] = 0x0143, /* U+0144: LATIN SMALL LETTER N WITH ACUTE */ [448+0x06] = 0x0145, /* U+0146: LATIN SMALL LETTER N WITH CEDILLA */ [448+0x08] = 0x0147, /* U+0148: LATIN SMALL LETTER N WITH CARON */ - [448+0x09] = -15, /* U+0149: LATIN SMALL LETTER N PRECEDED BY APOSTROPHE */ + [448+0x09] = -87, /* U+0149: LATIN SMALL LETTER N PRECEDED BY APOSTROPHE */ [448+0x0B] = 0x014A, /* U+014B: LATIN SMALL LETTER ENG */ [448+0x0D] = 0x014C, /* U+014D: LATIN SMALL LETTER O WITH MACRON */ [448+0x0F] = 0x014E, /* U+014F: LATIN SMALL LETTER O WITH BREVE */ @@ -3358,7 +3496,7 @@ static const int uppercase[4608] = { [576+0x2B] = 0x01EA, /* U+01EB: LATIN SMALL LETTER O WITH OGONEK */ [576+0x2D] = 0x01EC, /* U+01ED: LATIN SMALL LETTER O WITH OGONEK AND MACRON */ [576+0x2F] = 0x01EE, /* U+01EF: LATIN SMALL LETTER EZH WITH CARON */ - [576+0x30] = -18, /* U+01F0: LATIN SMALL LETTER J WITH CARON */ + [576+0x30] = -90, /* U+01F0: LATIN SMALL LETTER J WITH CARON */ _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org