Changeset: fccfa2ae186e for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fccfa2ae186e
Modified Files:
        monetdb5/modules/mal/pcre.c
        
sql/test/BugTracker-2018/Tests/ilike-foreign-characters.Bug-6633.stable.out
Branch: Aug2018
Log Message:

Don't use strcasecmp and strcasestr since they work byte-for-byte.
This fixes bug 6633.


diffs (269 lines):

diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -24,6 +24,9 @@
 #include "mal.h"
 #include "mal_exception.h"
 
+#include <wchar.h>
+#include <wctype.h>
+
 #ifdef HAVE_LIBPCRE
 #include <pcre.h>
 #ifndef PCRE_STUDY_JIT_COMPILE
@@ -87,21 +90,183 @@ typedef struct RE {
        struct RE *n;
 } RE;
 
-#ifndef HAVE_STRCASESTR
+/* we cannot use strcasecmp and strncasecmp since they work byte for
+ * byte and don't deal with multibyte encodings (such as UTF-8) */
+
+#ifdef _MSC_VER
+/* on Windows, we cannot set the UTF-8 locale, so we need to implement
+ * our own version of mbrtowc and mbstowcs */
+
+static size_t
+my_mbrtowc(wchar_t *dst, const char *src, size_t len)
+{
+       if (len == 0)
+               return (size_t) -1;
+       if ((src[0] & 0x80) == 0) {
+               *dst = src[0];
+               return 1;
+       }
+       if (len == 1)
+               return (size_t) -1;
+       if ((src[0] & 0xE0) == 0xC0) {
+               *dst = ((src[0] & 0x1F) << 6) | (src[1] & 0x3F);
+               return 2;
+       }
+       if (len == 2)
+               return (size_t) -1;
+       if ((src[0] & 0xF0) == 0xE0) {
+               *dst = ((src[0] & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | 
(src[2] & 0x3F);
+               return 3;
+       }
+       if (len == 3)
+               return (size_t) -1;
+       if ((src[0] & 0xF8) == 0xF0) {
+               *dst = ((src[0] & 0x0F) << 18) | ((src[1] & 0x3F) << 12) | 
((src[2] & 0x3F) << 6) | (src[3] & 0x3F);
+               return 4;
+       }
+       return (size_t) -1;
+}
+
+static size_t
+my_mbstowcs(wchar_t *dst, const char *src, size_t len)
+{
+       size_t i;
+
+       for (i = 0; i < len; i++) {
+               if ((src[0] & 0x80) == 0) {
+                       *dst = src[0];
+                       src += 1;
+               } else if ((src[0] & 0xE0) == 0xC0) {
+                       *dst = ((src[0] & 0x1F) << 6) | (src[1] & 0x3F);
+                       src += 2;
+               } else if ((src[0] & 0xF0) == 0xE0) {
+                       *dst = ((src[0] & 0x0F) << 12) | ((src[1] & 0x3F) << 6) 
| (src[2] & 0x3F);
+                       src += 3;
+               } else if ((src[0] & 0xF8) == 0xF0) {
+                       *dst = ((src[0] & 0x0F) << 18) | ((src[1] & 0x3F) << 
12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F);
+                       src += 4;
+               } else {
+                       return (size_t) -1;
+               }
+               if (*dst == 0)
+                       return i;
+               dst++;
+       }
+       return i;
+}
+#define mbrtowc(dst, src, len, ps)     my_mbrtowc(dst, src, len)
+#define mbstowcs(dst, src, len)        my_mbstowcs(dst, src, len)
+#endif
+
+static int
+mystrncasecmp(const char *s1, const char *s2, size_t n1)
+{
+       wchar_t c1, c2;
+       size_t n2 = n1;
+
+#ifndef _MSC_VER
+       mbstate_t ps1, ps2;
+       memset(&ps1, 0, sizeof(ps1));
+       memset(&ps2, 0, sizeof(ps2));
+#endif
+       while (n1 > 0 && n2 > 0) {
+               size_t nn1 = mbrtowc(&c1, s1, n1, &ps1);
+               size_t nn2 = mbrtowc(&c2, s2, n2, &ps2);
+               if (nn1 == 0)
+                       return -(nn2 != 0);
+               if (nn2 == 0)
+                       return 1;
+               if (nn1 == (size_t) -1 || nn1 == (size_t) -2 ||
+                       nn2 == (size_t) -1 || nn2 == (size_t) -2)
+                       return 0;        /* actually an error that shouldn't 
happen */
+               if (towlower((wint_t) c1) != towlower((wint_t) c2))
+                       return towlower((wint_t) c1) - towlower((wint_t) c2);
+               n1 -= nn1;
+               s1 += nn1;
+               n2 -= nn2;
+               s2 += nn2;
+       }
+       return 0;
+}
+
+static int
+mystrcasecmp(const char *s1, const char *s2)
+{
+       wchar_t c1, c2;
+
+#ifndef _MSC_VER
+       mbstate_t ps1, ps2;
+       memset(&ps1, 0, sizeof(ps1));
+       memset(&ps2, 0, sizeof(ps2));
+#endif
+       for (;;) {
+               /* use some ridiculously high number as the length of the
+                * input strings: we will still not go beyond the terminating
+                * '\0' */
+               size_t nn1 = mbrtowc(&c1, s1, 1000, &ps1);
+               size_t nn2 = mbrtowc(&c2, s2, 1000, &ps2);
+               if (nn1 == 0)
+                       return -(nn2 != 0);
+               if (nn2 == 0)
+                       return 1;
+               if (nn1 == (size_t) -1 || nn1 == (size_t) -2 ||
+                       nn2 == (size_t) -1 || nn2 == (size_t) -2)
+                       return 0;        /* actually an error that shouldn't 
happen */
+               if (towlower((wint_t) c1) != towlower((wint_t) c2))
+                       return towlower((wint_t) c1) - towlower((wint_t) c2);
+               s1 += nn1;
+               s2 += nn2;
+       }
+}
+
 static const char *
-strcasestr(const char *haystack, const char *needle)
+mystrcasestr(const char *haystack, const char *needle)
 {
        size_t nlen = strlen(needle);
 
        if (nlen == 0)
                return haystack;
-       for (size_t hlen = strlen(haystack); nlen <= hlen; haystack++, hlen--) {
-               if (strncasecmp(haystack, needle, nlen) == 0)
+       wchar_t *wneedle = GDKmalloc((nlen + 1) * sizeof(wchar_t));
+       if (wneedle == NULL || (nlen = mbstowcs(wneedle, needle, nlen + 1)) == 
(size_t) -1) {
+               GDKfree(wneedle);
+               nlen = strlen(needle);
+               /* fallback code */
+               for (size_t hlen = strlen(haystack); nlen <= hlen; hlen--) {
+                       if (mystrncasecmp(haystack, needle, nlen) == 0)
+                               return haystack;
+                       while ((*++haystack & 0xC0) == 0x80)
+                               hlen--;
+               }
+               return NULL;
+       }
+       for (wchar_t *w = wneedle; *w; w++)
+               *w = (wchar_t) towlower((wint_t) *w);
+#ifndef _MSC_VER
+       mbstate_t ps;
+       memset(&ps, 0, sizeof(ps));
+#endif
+       for (size_t hlen = strlen(haystack); *haystack; hlen--) {
+               size_t i;
+               for (i = 0; i < nlen; i++) {
+                       wchar_t c;
+                       size_t j = mbrtowc(&c, haystack, hlen, &ps);
+                       if (j == 0) {
+                               GDKfree(wneedle);
+                               return NULL;
+                       }
+                       if (towlower((wint_t) c) != (wint_t) wneedle[i])
+                               break;
+               }
+               if (i == nlen) {
+                       GDKfree(wneedle);
                        return haystack;
+               }
+               while ((*++haystack & 0xC0) == 0x80)
+                       hlen--;
        }
+       GDKfree(wneedle);
        return NULL;
 }
-#endif
 
 static int
 re_simple(const char *pat)
@@ -138,7 +303,7 @@ re_match_ignore(const char *s, RE *patte
 
        for (r = pattern; r; r = r->n) {
                if (!*s ||
-                       (r->search ? (s = strcasestr(s, r->k)) == NULL : 
strncasecmp(s, r->k, r->len) != 0))
+                       (r->search ? (s = mystrcasestr(s, r->k)) == NULL : 
mystrncasecmp(s, r->k, r->len) != 0))
                        return false;
                s += r->len;
        }
@@ -439,10 +604,10 @@ re_likeselect(BAT **bnp, BAT *b, BAT *s,
                        if (caseignore) {
                                if (anti)
                                        candscanloop(v && *v != '\200' &&
-                                                                strcasecmp(v, 
pat) != 0);
+                                                                
mystrcasecmp(v, pat) != 0);
                                else
                                        candscanloop(v && *v != '\200' &&
-                                                                strcasecmp(v, 
pat) == 0);
+                                                                
mystrcasecmp(v, pat) == 0);
                        } else {
                                if (anti)
                                        candscanloop(v && *v != '\200' &&
@@ -485,10 +650,10 @@ re_likeselect(BAT **bnp, BAT *b, BAT *s,
                        if (caseignore) {
                                if (anti)
                                        scanloop(v && *v != '\200' &&
-                                                        strcasecmp(v, pat) != 
0);
+                                                        mystrcasecmp(v, pat) 
!= 0);
                                else
                                        scanloop(v && *v != '\200' &&
-                                                        strcasecmp(v, pat) == 
0);
+                                                        mystrcasecmp(v, pat) 
== 0);
                        } else {
                                if (anti)
                                        scanloop(v && *v != '\200' &&
@@ -1284,7 +1449,7 @@ PCRElike4(bit *ret, const str *s, const 
                if (strcmp(ppat, str_nil) == 0) {
                        *ret = FALSE;
                        if (*isens) {
-                               if (strcasecmp(*s, *pat) == 0)
+                               if (mystrcasecmp(*s, *pat) == 0)
                                        *ret = TRUE;
                        } else {
                                if (strcmp(*s, *pat) == 0)
diff --git 
a/sql/test/BugTracker-2018/Tests/ilike-foreign-characters.Bug-6633.stable.out 
b/sql/test/BugTracker-2018/Tests/ilike-foreign-characters.Bug-6633.stable.out
--- 
a/sql/test/BugTracker-2018/Tests/ilike-foreign-characters.Bug-6633.stable.out
+++ 
b/sql/test/BugTracker-2018/Tests/ilike-foreign-characters.Bug-6633.stable.out
@@ -16,9 +16,6 @@ stdout of test 'ilike-foreign-characters
 # Listening for connection requests on 
mapi:monetdb://methuselah.da.cwi.nl:38893/
 # Listening for UNIX domain connection requests on 
mapi:monetdb:///var/tmp/mtest-20516/.s.monetdb.38893
 # MonetDB/GIS module loaded
-# MonetDB/SQL module loaded
-
-Ready.
 # SQL catalog created, loading sql scripts once
 # loading sql script: 09_like.sql
 # loading sql script: 10_math.sql
@@ -60,6 +57,9 @@ Ready.
 # loading sql script: 90_generator.sql
 # loading sql script: 90_generator_hge.sql
 # loading sql script: 99_system.sql
+# MonetDB/SQL module loaded
+
+Ready.
 
 # 10:10:04 >  
 # 10:10:04 >  "mclient" "-lsql" "-ftest" "-tnone" "-Eutf-8" "-i" "-e" 
"--host=/var/tmp/mtest-20516" "--port=38893"
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to