Changeset: e7382ec341ec for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/e7382ec341ec
Modified Files:
        monetdb5/modules/mal/pcre.c
Branch: ascii-flag
Log Message:

Implemented LIKE _ matching without PCRE.


diffs (truncated from 750 to 300 lines):

diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -53,40 +53,15 @@ typedef regex_t pcre;
 struct RE {
        char *k;
        bool search:1, atend:1, case_ignore:1;
-       size_t len;                                     /* number of bytes in 
string */
-       size_t ulen;                            /* number of codepoints in 
string */
+       size_t skip;                    /* number of codepoints to skip before 
matching */
+       size_t len;                             /* number of bytes in string */
+       size_t ulen;                    /* number of codepoints in string */
        struct RE *n;
 };
 
 /* We cannot use strcasecmp and strncasecmp since they work byte for
  * byte and don't deal with multibyte encodings (such as UTF-8). */
 
-/* returns true if the pattern does not contain unescaped `_' (single
- * character match) and ends with unescaped `%' (any sequence
- * match) */
-static inline bool
-re_simple(const char *pat, unsigned char esc)
-{
-       bool escaped = false;
-
-       if (pat == 0)
-               return false;
-       if (*pat == '%') {
-               pat++;
-       }
-       while (*pat) {
-               if (escaped) {
-                       escaped = false;
-               } else if ((unsigned char) *pat == esc) {
-                       escaped = true;
-               } else if (*pat == '_') {
-                       return false;
-               }
-               pat++;
-       }
-       return true;
-}
-
 static inline bool
 re_is_pattern_properly_escaped(const char *pat, unsigned char esc)
 {
@@ -124,14 +99,77 @@ re_match(const char *restrict s, const s
        const struct RE *r;
 
        for (r = pattern; r; r = r->n) {
-               if (*r->k == 0 && (r->search || *s == 0))
+               for (size_t i = 0; i < r->skip; s++) {
+                       if (*s == 0)
+                               return false;
+                       i += (*s & 0xC0) != 0x80;
+               }
+               if (r->search) {
+                       if (r->atend) {
+                               /* we're searching for a string at the end, so 
just skip
+                                * over everything and just compare with the 
tail of the
+                                * haystack */
+                               size_t slen = strlen(s);
+                               if (slen < r->ulen) {
+                                       /* remaining string too short: each 
codepoint
+                                        * requires at least one byte */
+                                       return false;
+                               }
+                               const char *e = s + slen;
+                               if (!r->case_ignore) {
+                                       if (slen < r->len) {
+                                               /* remaining string is too 
short to match */
+                                               return false;
+                                       }
+                                       e -= r->len;
+                                       if ((*e & 0xC0) == 0x80) {
+                                               /* not at start of a Unicode 
character, so
+                                                * cannot match (this test not 
strictly
+                                                * required: the strcmp should 
also return
+                                                * unequal) */
+                                               return false;
+                                       }
+                                       return strcmp(e, r->k) == 0;
+                               }
+                               size_t ulen = r->ulen;
+                               while (e > s && ulen != 0) {
+                                       ulen -= (*--e & 0xC0) != 0x80;
+                               }
+                               /* ulen != 0 means remaining string is too 
short */
+                               return ulen == 0 && GDKstrcasecmp(e, r->k) == 0;
+                       }
+                       /* in case we have a pattern consisting of % followed 
by _,
+                        * we need to backtrack, so use recursion; here we know 
we
+                        * have the %, look for an _ in the rest of the pattern
+                        * (note %_ and _% are equivalent and is taken care of 
by
+                        * the pattern construction in re_create) */
+                       for (const struct RE *p = r->n; p; p = p->n) {
+                               if (p->skip != 0) {
+                                       struct RE pat = *r;
+                                       pat.search = false;
+                                       pat.skip = 0;
+                                       do {
+                                               if (re_match(s, &pat))
+                                                       return true;
+                                               do
+                                                       s++;
+                                               while (*s && (*s & 0xC0) == 
0x80);
+                                       } while (*s != 0);
+                                       return false;
+                               }
+                       }
+               }
+               if (r->k[0] == 0 && (r->search || *s == 0))
                        return true;
                if (r->case_ignore) {
                        for (;;) {
+                               if (r->search && (s = GDKstrcasestr(s, r->k)) 
== NULL)
+                                       return false;
                                if (*s == '\0')
                                        return false;
                                /* in "atend" comparison, include NUL byte in 
the compare */
-                               if (GDKstrncasecmp(s, r->k, SIZE_MAX, r->len + 
r->atend) != 0) {
+                               if ((!r->search || r->atend) &&
+                                       GDKstrncasecmp(s, r->k, SIZE_MAX, 
r->len + r->atend) != 0) {
                                        /* no match */
                                        if (!r->search)
                                                return false;
@@ -147,14 +185,14 @@ re_match(const char *restrict s, const s
                                break;
                        }
                } else {
-                       /* search for first byte in pattern */
-                       if (r->search && (s = strchr(s, r->k[0])) == NULL)
-                               return false;
                        for (;;) {
+                               if (r->search && (s = strstr(s, r->k)) == NULL)
+                                       return false;
                                if (*s == '\0')
                                        return false;
                                /* in "atend" comparison, include NUL byte in 
the compare */
-                               if (strncmp(s, r->k, r->len + r->atend) != 0) {
+                               if ((!r->search || r->atend) &&
+                                       strncmp(s, r->k, r->len + r->atend) != 
0) {
                                        /* no match */
                                        if (!r->search)
                                                return false;
@@ -204,18 +242,26 @@ re_create(const char *pat, bool caseigno
 
        if (r == NULL)
                return NULL;
-       *r = (struct RE) {.atend = true };
+       *r = (struct RE) {
+               .atend = true,
+               .case_ignore = caseignore,
+       };
 
-       while (esc != '%' && *pat == '%') {
-               pat++;                                  /* skip % */
-               r->search = true;
+       for (;;) {
+               if (esc != '%' && *pat == '%') {
+                       pat++;                                  /* skip % */
+                       r->search = true;
+               } else if (esc != '_' && *pat == '_') {
+                       pat++;
+                       r->skip++;
+               } else {
+                       break;
+               }
        }
        if ((p = GDKstrdup(pat)) == NULL) {
                GDKfree(r);
                return NULL;
        }
-       if (caseignore)
-               n->case_ignore = true;
 
        r->k = p;
        q = p;
@@ -227,25 +273,34 @@ re_create(const char *pat, bool caseigno
                        escaped = false;
                } else if ((unsigned char) *p == esc) {
                        escaped = true;
-               } else if (*p == '%') {
+               } else if (*p == '%' || *p == '_') {
                        n->atend = false;
-                       while (p[1] == '%')
+                       bool search = false;
+                       size_t skip = 0;
+                       for (;;) {
+                               if (*p == '_')
+                                       skip++;
+                               else if (*p == '%')
+                                       search = true;
+                               else
+                                       break;
                                p++;
-                       if (p[1]) {
+                       }
+                       if (*p || skip != 0) {
                                n = n->n = GDKmalloc(sizeof(struct RE));
                                if (n == NULL)
                                        goto bailout;
                                *n = (struct RE) {
-                                       .search = true,
+                                       .search = search,
                                        .atend = true,
-                                       .k = p + 1
+                                       .skip = skip,
+                                       .k = p,
+                                       .case_ignore = caseignore,
                                };
-                               if (caseignore) {
-                                       n->case_ignore = true;
-                               }
                        }
                        *q = 0;
-                       q = p + 1;
+                       q = p;
+                       continue;                       /* skip increment, we 
already did it */
                } else {
                        *q++ = *p;
                        n->len++;
@@ -765,11 +820,11 @@ pcre_match_with_flags(bit *ret, const ch
 
 #ifdef HAVE_LIBPCRE
 /* special characters in PCRE that need to be escaped */
-static const char *pcre_specials = ".+?*()[]{}|^$\\";
+static const char pcre_specials[] = "$()*+.?[\\]^{|}";
 #else
 /* special characters in POSIX basic regular expressions that need to
  * be escaped */
-static const char *pcre_specials = "^.[$()|*+?{\\";
+static const char pcre_specials[] = "$()*+.?[\\^{|";
 #endif
 
 /* change SQL LIKE pattern into PCRE pattern */
@@ -1055,7 +1110,7 @@ PCREsql2pcre(str *ret, const str *pat, c
 }
 
 static inline str
-choose_like_path(char **ppat, bool *use_re, bool *use_strcmp, bool *empty,
+choose_like_path(bool *use_re, bool *use_strcmp, bool *empty,
                                 const char *pat, const char *esc)
 {
        str res = MAL_SUCCEED;
@@ -1074,17 +1129,8 @@ choose_like_path(char **ppat, bool *use_
                if (is_strcmpable(pat, esc)) {
                        *use_re = true;
                        *use_strcmp = true;
-               } else if (re_simple(pat, (unsigned char) *esc)) {
+               } else {
                        *use_re = true;
-               } else {
-                       if ((res = sql2pcre(ppat, pat, esc)) != MAL_SUCCEED)
-                               return res;
-                       if (strNil(*ppat)) {
-                               GDKfree(*ppat);
-                               *ppat = NULL;
-                               *use_re = true;
-                               *use_strcmp = true;
-                       }
                }
        }
        return res;
@@ -1095,11 +1141,10 @@ PCRElike_imp(bit *ret, const str *s, con
                         const bit *isens)
 {
        str res = MAL_SUCCEED;
-       char *ppat = NULL;
        bool use_re = false, use_strcmp = false, empty = false;
        struct RE *re = NULL;
 
-       if ((res = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
+       if ((res = choose_like_path(&use_re, &use_strcmp, &empty,
                                                                *pat, *esc)) != 
MAL_SUCCEED)
                return res;
 
@@ -1110,7 +1155,7 @@ PCRElike_imp(bit *ret, const str *s, con
 
        if (strNil(*s) || empty) {
                *ret = bit_nil;
-       } else if (use_re) {
+       } else {
                if (use_strcmp) {
                        *ret = *isens ? GDKstrcasecmp(*s, *pat) == 0
                                : strcmp(*s, *pat) == 0;
@@ -1121,13 +1166,10 @@ PCRElike_imp(bit *ret, const str *s, con
                        else
                                *ret = re_match(*s, re);
                }
-       } else {
-               res = *isens ? PCREimatch(ret, s, &ppat) : PCREmatch(ret, s, 
&ppat);
        }
 
        if (re)
                re_destroy(re);
-       GDKfree(ppat);
        return res;
 }
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to