Changeset: e7382ec341ec for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/e7382ec341ec Modified Files: monetdb5/modules/mal/pcre.c Branch: ascii-flag Log Message:
Implemented LIKE _ matching without PCRE. diffs (truncated from 750 to 300 lines): diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -53,40 +53,15 @@ typedef regex_t pcre; struct RE { char *k; bool search:1, atend:1, case_ignore:1; - size_t len; /* number of bytes in string */ - size_t ulen; /* number of codepoints in string */ + size_t skip; /* number of codepoints to skip before matching */ + size_t len; /* number of bytes in string */ + size_t ulen; /* number of codepoints in string */ struct RE *n; }; /* We cannot use strcasecmp and strncasecmp since they work byte for * byte and don't deal with multibyte encodings (such as UTF-8). */ -/* returns true if the pattern does not contain unescaped `_' (single - * character match) and ends with unescaped `%' (any sequence - * match) */ -static inline bool -re_simple(const char *pat, unsigned char esc) -{ - bool escaped = false; - - if (pat == 0) - return false; - if (*pat == '%') { - pat++; - } - while (*pat) { - if (escaped) { - escaped = false; - } else if ((unsigned char) *pat == esc) { - escaped = true; - } else if (*pat == '_') { - return false; - } - pat++; - } - return true; -} - static inline bool re_is_pattern_properly_escaped(const char *pat, unsigned char esc) { @@ -124,14 +99,77 @@ re_match(const char *restrict s, const s const struct RE *r; for (r = pattern; r; r = r->n) { - if (*r->k == 0 && (r->search || *s == 0)) + for (size_t i = 0; i < r->skip; s++) { + if (*s == 0) + return false; + i += (*s & 0xC0) != 0x80; + } + if (r->search) { + if (r->atend) { + /* we're searching for a string at the end, so just skip + * over everything and just compare with the tail of the + * haystack */ + size_t slen = strlen(s); + if (slen < r->ulen) { + /* remaining string too short: each codepoint + * requires at least one byte */ + return false; + } + const char *e = s + slen; + if (!r->case_ignore) { + if (slen < r->len) { + /* remaining string is too short to match */ + return false; + } + e -= r->len; + if ((*e & 0xC0) == 0x80) { + /* not at start of a Unicode character, so + * cannot match (this test not strictly + * required: the strcmp should also return + * unequal) */ + return false; + } + return strcmp(e, r->k) == 0; + } + size_t ulen = r->ulen; + while (e > s && ulen != 0) { + ulen -= (*--e & 0xC0) != 0x80; + } + /* ulen != 0 means remaining string is too short */ + return ulen == 0 && GDKstrcasecmp(e, r->k) == 0; + } + /* in case we have a pattern consisting of % followed by _, + * we need to backtrack, so use recursion; here we know we + * have the %, look for an _ in the rest of the pattern + * (note %_ and _% are equivalent and is taken care of by + * the pattern construction in re_create) */ + for (const struct RE *p = r->n; p; p = p->n) { + if (p->skip != 0) { + struct RE pat = *r; + pat.search = false; + pat.skip = 0; + do { + if (re_match(s, &pat)) + return true; + do + s++; + while (*s && (*s & 0xC0) == 0x80); + } while (*s != 0); + return false; + } + } + } + if (r->k[0] == 0 && (r->search || *s == 0)) return true; if (r->case_ignore) { for (;;) { + if (r->search && (s = GDKstrcasestr(s, r->k)) == NULL) + return false; if (*s == '\0') return false; /* in "atend" comparison, include NUL byte in the compare */ - if (GDKstrncasecmp(s, r->k, SIZE_MAX, r->len + r->atend) != 0) { + if ((!r->search || r->atend) && + GDKstrncasecmp(s, r->k, SIZE_MAX, r->len + r->atend) != 0) { /* no match */ if (!r->search) return false; @@ -147,14 +185,14 @@ re_match(const char *restrict s, const s break; } } else { - /* search for first byte in pattern */ - if (r->search && (s = strchr(s, r->k[0])) == NULL) - return false; for (;;) { + if (r->search && (s = strstr(s, r->k)) == NULL) + return false; if (*s == '\0') return false; /* in "atend" comparison, include NUL byte in the compare */ - if (strncmp(s, r->k, r->len + r->atend) != 0) { + if ((!r->search || r->atend) && + strncmp(s, r->k, r->len + r->atend) != 0) { /* no match */ if (!r->search) return false; @@ -204,18 +242,26 @@ re_create(const char *pat, bool caseigno if (r == NULL) return NULL; - *r = (struct RE) {.atend = true }; + *r = (struct RE) { + .atend = true, + .case_ignore = caseignore, + }; - while (esc != '%' && *pat == '%') { - pat++; /* skip % */ - r->search = true; + for (;;) { + if (esc != '%' && *pat == '%') { + pat++; /* skip % */ + r->search = true; + } else if (esc != '_' && *pat == '_') { + pat++; + r->skip++; + } else { + break; + } } if ((p = GDKstrdup(pat)) == NULL) { GDKfree(r); return NULL; } - if (caseignore) - n->case_ignore = true; r->k = p; q = p; @@ -227,25 +273,34 @@ re_create(const char *pat, bool caseigno escaped = false; } else if ((unsigned char) *p == esc) { escaped = true; - } else if (*p == '%') { + } else if (*p == '%' || *p == '_') { n->atend = false; - while (p[1] == '%') + bool search = false; + size_t skip = 0; + for (;;) { + if (*p == '_') + skip++; + else if (*p == '%') + search = true; + else + break; p++; - if (p[1]) { + } + if (*p || skip != 0) { n = n->n = GDKmalloc(sizeof(struct RE)); if (n == NULL) goto bailout; *n = (struct RE) { - .search = true, + .search = search, .atend = true, - .k = p + 1 + .skip = skip, + .k = p, + .case_ignore = caseignore, }; - if (caseignore) { - n->case_ignore = true; - } } *q = 0; - q = p + 1; + q = p; + continue; /* skip increment, we already did it */ } else { *q++ = *p; n->len++; @@ -765,11 +820,11 @@ pcre_match_with_flags(bit *ret, const ch #ifdef HAVE_LIBPCRE /* special characters in PCRE that need to be escaped */ -static const char *pcre_specials = ".+?*()[]{}|^$\\"; +static const char pcre_specials[] = "$()*+.?[\\]^{|}"; #else /* special characters in POSIX basic regular expressions that need to * be escaped */ -static const char *pcre_specials = "^.[$()|*+?{\\"; +static const char pcre_specials[] = "$()*+.?[\\^{|"; #endif /* change SQL LIKE pattern into PCRE pattern */ @@ -1055,7 +1110,7 @@ PCREsql2pcre(str *ret, const str *pat, c } static inline str -choose_like_path(char **ppat, bool *use_re, bool *use_strcmp, bool *empty, +choose_like_path(bool *use_re, bool *use_strcmp, bool *empty, const char *pat, const char *esc) { str res = MAL_SUCCEED; @@ -1074,17 +1129,8 @@ choose_like_path(char **ppat, bool *use_ if (is_strcmpable(pat, esc)) { *use_re = true; *use_strcmp = true; - } else if (re_simple(pat, (unsigned char) *esc)) { + } else { *use_re = true; - } else { - if ((res = sql2pcre(ppat, pat, esc)) != MAL_SUCCEED) - return res; - if (strNil(*ppat)) { - GDKfree(*ppat); - *ppat = NULL; - *use_re = true; - *use_strcmp = true; - } } } return res; @@ -1095,11 +1141,10 @@ PCRElike_imp(bit *ret, const str *s, con const bit *isens) { str res = MAL_SUCCEED; - char *ppat = NULL; bool use_re = false, use_strcmp = false, empty = false; struct RE *re = NULL; - if ((res = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, + if ((res = choose_like_path(&use_re, &use_strcmp, &empty, *pat, *esc)) != MAL_SUCCEED) return res; @@ -1110,7 +1155,7 @@ PCRElike_imp(bit *ret, const str *s, con if (strNil(*s) || empty) { *ret = bit_nil; - } else if (use_re) { + } else { if (use_strcmp) { *ret = *isens ? GDKstrcasecmp(*s, *pat) == 0 : strcmp(*s, *pat) == 0; @@ -1121,13 +1166,10 @@ PCRElike_imp(bit *ret, const str *s, con else *ret = re_match(*s, re); } - } else { - res = *isens ? PCREimatch(ret, s, &ppat) : PCREmatch(ret, s, &ppat); } if (re) re_destroy(re); - GDKfree(ppat); return res; } _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org