Changeset: c70c5c521c4d for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c70c5c521c4d
Modified Files:
    monetdb5/mal/Tests/tst201.malC
    monetdb5/modules/mal/pcre.c
Branch: Jun2020
Log Message:
Also don't use PCRE for patterns without _ not ending in %. diffs (truncated from 366 to 300 lines): diff --git a/monetdb5/mal/Tests/tst201.malC b/monetdb5/mal/Tests/tst201.malC --- a/monetdb5/mal/Tests/tst201.malC +++ b/monetdb5/mal/Tests/tst201.malC @@ -61,14 +61,14 @@ exit MALException; v45:= algebra.slice(b,v43,v44); v46:= io.print(v45); - v75:= algebra.likeselect(b, nil:bat[:oid], "ox", nil:str, false, false); + v75:= algebra.likeselect(b, nil:bat[:oid], "ox", "", false, false); v76:= io.print(v75); - v78:= algebra.likeselect(b, nil:bat[:oid], "fo", nil:str, false, false); + v78:= algebra.likeselect(b, nil:bat[:oid], "fo", "", false, false); v79:= io.print(v78); - v81:= algebra.likeselect(b, nil:bat[:oid], "fox", nil:str, false, false); + v81:= algebra.likeselect(b, nil:bat[:oid], "fox", "", false, false); v82:= io.print(v81); - v84:= algebra.likeselect(b, nil:bat[:oid], " fox", nil:str, false, false); + v84:= algebra.likeselect(b, nil:bat[:oid], " fox", "", false, false); v85:= io.print(v84); diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -85,13 +85,14 @@ mal_export str ILIKEjoin(bat *r1, bat *r mal_export str ILIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate); /* current implementation assumes simple %keyword% [keyw%]* */ -typedef struct RE { +struct RE { char *k; uint32_t *w; - bool search; + bool search:1, + atend:1; size_t len; struct RE *n; -} RE; +}; /* We cannot use strcasecmp and strncasecmp since they work byte for * byte and don't deal with multibyte encodings (such as UTF-8). 
@@ -233,32 +234,32 @@ myucslen(const uint32_t *ucs) return i; } -static inline int -mywstrncasecmp(const char *restrict s1, const uint32_t *restrict s2, size_t n2) +static inline bool +mywstrncaseeq(const char *restrict s1, const uint32_t *restrict s2, size_t n2, bool atend) { uint32_t c1; while (n2 > 0) { size_t nn1 = utfc8touc(&c1, s1); if (nn1 == 0 || nn1 == (size_t) -1) - return -(*s2 != 0); + return (*s2 == 0); if (*s2 == 0) - return 1; + return false; if (nn1 == (size_t) -1 || nn1 == (size_t) -2) - return 0; /* actually an error that shouldn't happen */ + return true; /* actually an error that shouldn't happen */ #if SIZEOF_WCHAR_T == 2 if (c1 > 0xFFFF || *s2 > 0xFFFF) { if (c1 != *s2) - return c1 - *s2; + return false; } else #endif if (towlower((wint_t) c1) != towlower((wint_t) *s2)) - return towlower((wint_t) c1) - towlower((wint_t) *s2); + return false; s1 += nn1; n2--; s2++; } - return 0; + return !atend || *s1 == 0; } static inline int @@ -316,12 +317,12 @@ mywstrcasecmp(const char *restrict s1, c } static inline const char * -mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle) +mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle, bool atend) { size_t nlen = myucslen(wneedle); if (nlen == 0) - return haystack; + return atend ? 
haystack + strlen(haystack) : haystack; size_t hlen = strlen(haystack); @@ -347,7 +348,7 @@ mywstrcasestr(const char *restrict hayst break; h += j; } - if (i == nlen) + if (i == nlen && (!atend || haystack[h] == 0)) return haystack; haystack += step; hlen -= step; @@ -362,31 +363,26 @@ static bool re_simple(const char *pat, unsigned char esc) { bool escaped = false; - bool percatend = false; if (pat == 0) - return 0; + return false; if (*pat == '%') { - percatend = true; pat++; } while (*pat) { - percatend = false; if (escaped) { escaped = false; } else if ((unsigned char) *pat == esc) { escaped = true; } else if (*pat == '_') { - return 0; - } else if (*pat == '%') { - percatend = true; + return false; } pat++; } - return percatend; + return true; } -static bool +static inline bool is_strcmpable(const char *pat, const char *esc) { if (pat[strcspn(pat, "%_")]) @@ -395,15 +391,17 @@ is_strcmpable(const char *pat, const cha } static inline bool -re_match_ignore(const char *s, RE *pattern) +re_match_ignore(const char *restrict s, const struct RE *restrict pattern) { - RE *r; + const struct RE *r; for (r = pattern; r; r = r->n) { if (*r->w == 0 && (r->search || *s == 0)) return true; if (!*s || - (r->search ? (s = mywstrcasestr(s, r->w)) == NULL : mywstrncasecmp(s, r->w, r->len) != 0)) + (r->search + ? (s = mywstrcasestr(s, r->w, r->atend)) == NULL + : !mywstrncaseeq(s, r->w, r->len, r->atend))) return false; s += r->len; } @@ -411,15 +409,22 @@ re_match_ignore(const char *s, RE *patte } static inline bool -re_match_no_ignore(const char *s, RE *pattern) +re_match_no_ignore(const char *restrict s, const struct RE *restrict pattern) { - RE *r; + const struct RE *r; + size_t l; for (r = pattern; r; r = r->n) { if (*r->k == 0 && (r->search || *s == 0)) return true; if (!*s || - (r->search ? (s = strstr(s, r->k)) == NULL : strncmp(s, r->k, r->len) != 0)) + (r->search + ? (r->atend + ? 
(l = strlen(s)) < r->len || strcmp(s + l - r->len, r->k) != 0 + : (s = strstr(s, r->k)) == NULL) + : (r->atend + ? strcmp(s, r->k) != 0 + : strncmp(s, r->k, r->len) != 0))) return false; s += r->len; } @@ -427,13 +432,13 @@ re_match_no_ignore(const char *s, RE *pa } static void -re_destroy(RE *p) +re_destroy(struct RE *p) { if (p) { GDKfree(p->k); GDKfree(p->w); do { - RE *n = p->n; + struct RE *n = p->n; GDKfree(p); p = n; @@ -446,15 +451,15 @@ re_destroy(RE *p) * fields in the first structure are allocated, whereas in all * subsequent structures the fields point into the allocated buffer of * the first. */ -static RE * +static struct RE * re_create(const char *pat, bool caseignore, uint32_t esc) { - RE *r = (RE*)GDKmalloc(sizeof(RE)), *n = r; + struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r; bool escaped = false; if (r == NULL) return NULL; - *r = (struct RE) {.search = false}; + *r = (struct RE) {.atend = true}; while (esc != '%' && *pat == '%') { pat++; /* skip % */ @@ -473,25 +478,28 @@ re_create(const char *pat, bool caseigno while (*wp) { if (escaped) { *wq++ = *wp; + n->len++; escaped = false; } else if (*wp == esc) { escaped = true; } else if (*wp == '%') { - n->len = (size_t) (wq - n->w); + n->atend = false; while (wp[1] == '%') wp++; if (wp[1]) { - n = n->n = GDKmalloc(sizeof(RE)); + n = n->n = GDKmalloc(sizeof(struct RE)); if (n == NULL) goto bailout; - *n = (struct RE) {.search = true, .w = wp + 1}; + *n = (struct RE) {.search = true, .atend = true, .w = wp + 1}; } *wq++ = 0; } else { *wq++ = *wp; + n->len++; } wp++; } + *wq = 0; } else { char *p, *q; if ((p = GDKstrdup(pat)) == NULL) { @@ -503,25 +511,28 @@ re_create(const char *pat, bool caseigno while (*p) { if (escaped) { *q++ = *p; + n->len++; escaped = false; } else if ((unsigned char) *p == esc) { escaped = true; } else if (*p == '%') { - n->len = (size_t) (q - n->k); + n->atend = false; while (p[1] == '%') p++; if (p[1]) { - n = n->n = GDKmalloc(sizeof(RE)); + n = n->n = 
GDKmalloc(sizeof(struct RE)); if (n == NULL) goto bailout; - *n = (struct RE) {.search = true, .k = p + 1}; + *n = (struct RE) {.search = true, .atend = true, .k = p + 1}; } *q++ = 0; } else { *q++ = *p; + n->len++; } p++; } + *q = 0; } return r; bailout: @@ -713,7 +724,7 @@ re_likeselect(BAT **bnp, BAT *b, BAT *s, BUN p, q; oid o, off; const char *v; - RE *re = NULL; + struct RE *re = NULL; uint32_t *wpat = NULL; assert(ATOMstorage(b->ttype) == TYPE_str); @@ -765,14 +776,14 @@ re_likeselect(BAT **bnp, BAT *b, BAT *s, _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list