I wrote:
> I've also decided after reflection that the patch should indeed
> create a named "word" character class. That's allowed per POSIX,
> and it simplifies some aspects of the documentation, since we can
> rely on referencing the class instead of repeating ourselves.
> The attached 0001 v2 does that; it's otherwise the same as before.

Sigh, this time with the attachments ...

regards, tom lane

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index d8224272a5..860ae11826 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -6097,6 +6097,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; non-ASCII characters to belong to any of these classes.) In addition to these standard character classes, <productname>PostgreSQL</productname> defines + the <literal>word</literal> character class, which is the same as + <literal>alnum</literal> plus the underscore (<literal>_</literal>) + character, and the <literal>ascii</literal> character class, which contains exactly the 7-bit ASCII set. </para> @@ -6108,9 +6111,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; matching empty strings at the beginning and end of a word respectively. A word is defined as a sequence of word characters that is neither preceded nor followed by word - characters. A word character is an <literal>alnum</literal> character (as - defined by the <acronym>POSIX</acronym> character class described above) - or an underscore. This is an extension, compatible with but not + characters. A word character is any character belonging to the + <literal>word</literal> character class, that is, any letter, digit, + or underscore. This is an extension, compatible with but not specified by <acronym>POSIX</acronym> 1003.2, and should be used with caution in software intended to be portable to other systems. 
The constraint escapes described below are usually preferable; they @@ -6330,8 +6333,7 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; <row> <entry> <literal>\w</literal> </entry> - <entry> <literal>[[:alnum:]_]</literal> - (note underscore is included) </entry> + <entry> <literal>[[:word:]]</literal> </entry> </row> <row> @@ -6346,21 +6348,18 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; <row> <entry> <literal>\W</literal> </entry> - <entry> <literal>[^[:alnum:]_]</literal> - (note underscore is included) </entry> + <entry> <literal>[^[:word:]]</literal> </entry> </row> </tbody> </tgroup> </table> <para> - Within bracket expressions, <literal>\d</literal>, <literal>\s</literal>, - and <literal>\w</literal> lose their outer brackets, - and <literal>\D</literal>, <literal>\S</literal>, and <literal>\W</literal> are illegal. - (So, for example, <literal>[a-c\d]</literal> is equivalent to + The class-shorthand escapes also work within bracket expressions, + although the definitions shown above are not quite syntactically + valid in that context. + For example, <literal>[a-c\d]</literal> is equivalent to <literal>[a-c[:digit:]]</literal>. - Also, <literal>[a-c\D]</literal>, which is equivalent to - <literal>[a-c^[:digit:]]</literal>, is illegal.) </para> <table id="posix-constraint-escapes-table"> diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n index 4621bfc25f..1afaa7cce7 100644 --- a/src/backend/regex/re_syntax.n +++ b/src/backend/regex/re_syntax.n @@ -519,15 +519,10 @@ character classes: (note underscore) .RE .PP -Within bracket expressions, `\fB\ed\fR', `\fB\es\fR', -and `\fB\ew\fR'\& -lose their outer brackets, -and `\fB\eD\fR', `\fB\eS\fR', -and `\fB\eW\fR'\& -are illegal. -.VS 8.2 -(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR. -Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.) 
+The class-shorthand escapes also work within bracket expressions, +although the definitions shown above are not quite syntactically +valid in that context. +For example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR. .VE 8.2 .PP A constraint escape (AREs only) is a constraint, diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c index 0864011cce..30bda0e5ad 100644 --- a/src/backend/regex/regc_color.c +++ b/src/backend/regex/regc_color.c @@ -936,7 +936,16 @@ okcolors(struct nfa *nfa, } else if (cd->nschrs == 0 && cd->nuchrs == 0) { - /* parent empty, its arcs change color to subcolor */ + /* + * Parent is now empty, so just change all its arcs to the + * subcolor, then free the parent. + * + * It is not obvious that simply relabeling the arcs like this is + * OK; it appears to risk creating duplicate arcs. We are + * basically relying on the assumption that processing of a + * bracket expression can't create arcs of both a color and its + * subcolor between the bracket's endpoints. 
+ */ cd->sub = NOSUB; scd = &cm->cd[sco]; assert(scd->nschrs > 0 || scd->nuchrs > 0); @@ -1062,6 +1071,7 @@ colorcomplement(struct nfa *nfa, struct colordesc *cd; struct colordesc *end = CDEND(cm); color co; + struct arc *a; assert(of != from); @@ -1069,10 +1079,26 @@ colorcomplement(struct nfa *nfa, if (findarc(of, PLAIN, RAINBOW) != NULL) return; + /* Otherwise, transiently mark the colors that appear in of's out-arcs */ + for (a = of->outs; a != NULL; a = a->outchain) + { + if (a->type == PLAIN) + { + assert(a->co >= 0); + cd = &cm->cd[a->co]; + assert(!UNUSEDCOLOR(cd)); + cd->flags |= COLMARK; + } + } + + /* Scan colors, clear transient marks, add arcs for unmarked colors */ for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++) - if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)) - if (findarc(of, PLAIN, co) == NULL) - newarc(nfa, type, co, from, to); + { + if (cd->flags & COLMARK) + cd->flags &= ~COLMARK; + else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)) + newarc(nfa, type, co, from, to); + } } diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c index 1666453164..7673dab76f 100644 --- a/src/backend/regex/regc_lex.c +++ b/src/backend/regex/regc_lex.c @@ -193,83 +193,6 @@ prefixes(struct vars *v) } } -/* - * lexnest - "call a subroutine", interpolating string at the lexical level - * - * Note, this is not a very general facility. There are a number of - * implicit assumptions about what sorts of strings can be subroutines. 
- */ -static void -lexnest(struct vars *v, - const chr *beginp, /* start of interpolation */ - const chr *endp) /* one past end of interpolation */ -{ - assert(v->savenow == NULL); /* only one level of nesting */ - v->savenow = v->now; - v->savestop = v->stop; - v->now = beginp; - v->stop = endp; -} - -/* - * string constants to interpolate as expansions of things like \d - */ -static const chr backd[] = { /* \d */ - CHR('['), CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr backD[] = { /* \D */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr brbackd[] = { /* \d within brackets */ - CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']') -}; -static const chr backs[] = { /* \s */ - CHR('['), CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr backS[] = { /* \S */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr brbacks[] = { /* \s within brackets */ - CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']') -}; -static const chr backw[] = { /* \w */ - CHR('['), CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') -}; -static const chr backW[] = { /* \W */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') -}; -static const chr brbackw[] = { /* \w within brackets */ - CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_') -}; - -/* - * lexword - interpolate a bracket expression for word characters - * Possibly ought to inquire whether there is a "word" character 
class. - */ -static void -lexword(struct vars *v) -{ - lexnest(v, backw, ENDOF(backw)); -} - /* * next - get next token */ @@ -292,14 +215,6 @@ next(struct vars *v) RETV(SBEGIN, 0); /* same as \A */ } - /* if we're nested and we've hit end, return to outer level */ - if (v->savenow != NULL && ATEOS()) - { - v->now = v->savenow; - v->stop = v->savestop; - v->savenow = v->savestop = NULL; - } - /* skip white space etc. if appropriate (not in literal or []) */ if (v->cflags & REG_EXPANDED) switch (v->lexcon) @@ -420,32 +335,15 @@ next(struct vars *v) NOTE(REG_UNONPOSIX); if (ATEOS()) FAILW(REG_EESCAPE); - (DISCARD) lexescape(v); + if (!lexescape(v)) + return 0; switch (v->nexttype) { /* not all escapes okay here */ case PLAIN: + case CCLASSS: + case CCLASSC: return 1; break; - case CCLASS: - switch (v->nextvalue) - { - case 'd': - lexnest(v, brbackd, ENDOF(brbackd)); - break; - case 's': - lexnest(v, brbacks, ENDOF(brbacks)); - break; - case 'w': - lexnest(v, brbackw, ENDOF(brbackw)); - break; - default: - FAILW(REG_EESCAPE); - break; - } - /* lexnest done, back up and try again */ - v->nexttype = v->lasttype; - return next(v); - break; } /* not one of the acceptable escapes */ FAILW(REG_EESCAPE); @@ -691,49 +589,17 @@ next(struct vars *v) } RETV(PLAIN, *v->now++); } - (DISCARD) lexescape(v); - if (ISERR()) - FAILW(REG_EESCAPE); - if (v->nexttype == CCLASS) - { /* fudge at lexical level */ - switch (v->nextvalue) - { - case 'd': - lexnest(v, backd, ENDOF(backd)); - break; - case 'D': - lexnest(v, backD, ENDOF(backD)); - break; - case 's': - lexnest(v, backs, ENDOF(backs)); - break; - case 'S': - lexnest(v, backS, ENDOF(backS)); - break; - case 'w': - lexnest(v, backw, ENDOF(backw)); - break; - case 'W': - lexnest(v, backW, ENDOF(backW)); - break; - default: - assert(NOTREACHED); - FAILW(REG_ASSERT); - break; - } - /* lexnest done, back up and try again */ - v->nexttype = v->lasttype; - return next(v); - } - /* otherwise, lexescape has already done the work */ - return 
!ISERR(); + return lexescape(v); } /* * lexescape - parse an ARE backslash escape (backslash already eaten) - * Note slightly nonstandard use of the CCLASS type code. + * + * This is used for ARE backslashes both normally and inside bracket + * expressions. In the latter case, not all escape types are allowed, + * but the caller must reject unwanted ones after we return. */ -static int /* not actually used, but convenient for RETV */ +static int lexescape(struct vars *v) { chr c; @@ -775,11 +641,11 @@ lexescape(struct vars *v) break; case CHR('d'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'd'); + RETV(CCLASSS, CC_DIGIT); break; case CHR('D'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'D'); + RETV(CCLASSC, CC_DIGIT); break; case CHR('e'): NOTE(REG_UUNPORT); @@ -802,11 +668,11 @@ lexescape(struct vars *v) break; case CHR('s'): NOTE(REG_ULOCALE); - RETV(CCLASS, 's'); + RETV(CCLASSS, CC_SPACE); break; case CHR('S'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'S'); + RETV(CCLASSC, CC_SPACE); break; case CHR('t'): RETV(PLAIN, CHR('\t')); @@ -828,11 +694,11 @@ lexescape(struct vars *v) break; case CHR('w'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'w'); + RETV(CCLASSS, CC_WORD); break; case CHR('W'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'W'); + RETV(CCLASSC, CC_WORD); break; case CHR('x'): NOTE(REG_UUNPORT); diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c index 047abc3e1e..b5f3a73b1b 100644 --- a/src/backend/regex/regc_locale.c +++ b/src/backend/regex/regc_locale.c @@ -350,17 +350,13 @@ static const struct cname }; /* - * The following arrays define the valid character class names. + * The following array defines the valid character class names. + * The entries must match enum char_classes in regguts.h. 
*/ static const char *const classNames[NUM_CCLASSES + 1] = { "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", - "lower", "print", "punct", "space", "upper", "xdigit", NULL -}; - -enum classes -{ - CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, - CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT + "lower", "print", "punct", "space", "upper", "xdigit", "word", + NULL }; /* @@ -536,54 +532,58 @@ eclass(struct vars *v, /* context */ } /* - * cclass - supply cvec for a character class - * - * Must include case counterparts if "cases" is true. + * lookupcclass - lookup a character class identified by name * - * The returned cvec might be either a transient cvec gotten from getcvec(), - * or a permanently cached one from pg_ctype_get_cache(). This is okay - * because callers are not supposed to explicitly free the result either way. + * On failure, sets an error code in *v; the result is then garbage. */ -static struct cvec * -cclass(struct vars *v, /* context */ - const chr *startp, /* where the name starts */ - const chr *endp, /* just past the end of the name */ - int cases) /* case-independent? */ +static enum char_classes +lookupcclass(struct vars *v, /* context (for returning errors) */ + const chr *startp, /* where the name starts */ + const chr *endp) /* just past the end of the name */ { size_t len; - struct cvec *cv = NULL; const char *const *namePtr; - int i, - index; + int i; /* * Map the name to the corresponding enumerated value. */ len = endp - startp; - index = -1; for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) { if (strlen(*namePtr) == len && pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) - { - index = i; - break; - } - } - if (index == -1) - { - ERR(REG_ECTYPE); - return NULL; + return (enum char_classes) i; } + ERR(REG_ECTYPE); + return (enum char_classes) 0; +} + +/* + * cclasscvec - supply cvec for a character class + * + * Must include case counterparts if "cases" is true. 
+ * + * The returned cvec might be either a transient cvec gotten from getcvec(), + * or a permanently cached one from pg_ctype_get_cache(). This is okay + * because callers are not supposed to explicitly free the result either way. + */ +static struct cvec * +cclasscvec(struct vars *v, /* context */ + enum char_classes cclasscode, /* class to build a cvec for */ + int cases) /* case-independent? */ +{ + struct cvec *cv = NULL; + /* * Remap lower and upper to alpha if the match is case insensitive. */ if (cases && - ((enum classes) index == CC_LOWER || - (enum classes) index == CC_UPPER)) - index = (int) CC_ALPHA; + (cclasscode == CC_LOWER || + cclasscode == CC_UPPER)) + cclasscode = CC_ALPHA; /* * Now compute the character class contents. For classes that are based @@ -595,16 +595,19 @@ cclass(struct vars *v, /* context */ * NB: keep this code in sync with cclass_column_index(), below. */ - switch ((enum classes) index) + switch (cclasscode) { case CC_PRINT: - cv = pg_ctype_get_cache(pg_wc_isprint, index); + cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode); break; case CC_ALNUM: - cv = pg_ctype_get_cache(pg_wc_isalnum, index); + cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode); break; case CC_ALPHA: - cv = pg_ctype_get_cache(pg_wc_isalpha, index); + cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode); + break; + case CC_WORD: + cv = pg_ctype_get_cache(pg_wc_isword, cclasscode); break; case CC_ASCII: /* hard-wired meaning */ @@ -625,10 +628,10 @@ cclass(struct vars *v, /* context */ addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: - cv = pg_ctype_get_cache(pg_wc_isdigit, index); + cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode); break; case CC_PUNCT: - cv = pg_ctype_get_cache(pg_wc_ispunct, index); + cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode); break; case CC_XDIGIT: @@ -646,16 +649,16 @@ cclass(struct vars *v, /* context */ } break; case CC_SPACE: - cv = pg_ctype_get_cache(pg_wc_isspace, index); + cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode); 
break; case CC_LOWER: - cv = pg_ctype_get_cache(pg_wc_islower, index); + cv = pg_ctype_get_cache(pg_wc_islower, cclasscode); break; case CC_UPPER: - cv = pg_ctype_get_cache(pg_wc_isupper, index); + cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode); break; case CC_GRAPH: - cv = pg_ctype_get_cache(pg_wc_isgraph, index); + cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode); break; } @@ -678,7 +681,7 @@ cclass_column_index(struct colormap *cm, chr c) /* * Note: we should not see requests to consider cclasses that are not - * treated as locale-specific by cclass(), above. + * treated as locale-specific by cclasscvec(), above. */ if (cm->classbits[CC_PRINT] && pg_wc_isprint(c)) colnum |= cm->classbits[CC_PRINT]; @@ -686,6 +689,8 @@ cclass_column_index(struct colormap *cm, chr c) colnum |= cm->classbits[CC_ALNUM]; if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c)) colnum |= cm->classbits[CC_ALPHA]; + if (cm->classbits[CC_WORD] && pg_wc_isword(c)) + colnum |= cm->classbits[CC_WORD]; assert(cm->classbits[CC_ASCII] == 0); assert(cm->classbits[CC_BLANK] == 0); assert(cm->classbits[CC_CNTRL] == 0); diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 1fff3df1da..bbbd61c604 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -400,6 +400,15 @@ pg_wc_isalnum(pg_wchar c) return 0; /* can't get here, but keep compiler quiet */ } +static int +pg_wc_isword(pg_wchar c) +{ + /* We define word characters as alnum class plus underscore */ + if (c == CHR('_')) + return 1; + return pg_wc_isalnum(c); +} + static int pg_wc_isupper(pg_wchar c) { diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 0cd4b4c4c2..7b77a29136 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -46,13 +46,18 @@ static struct subre *parsebranch(struct vars *, int, int, struct state *, struct static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *); 
static void nonword(struct vars *, int, struct state *, struct state *); static void word(struct vars *, int, struct state *, struct state *); +static void charclass(struct vars *, enum char_classes, + struct state *, struct state *); +static void charclasscomplement(struct vars *, enum char_classes, + struct state *, struct state *); static int scannum(struct vars *); static void repeat(struct vars *, struct state *, struct state *, int, int); static void bracket(struct vars *, struct state *, struct state *); static void cbracket(struct vars *, struct state *, struct state *); -static void brackpart(struct vars *, struct state *, struct state *); +static void brackpart(struct vars *, struct state *, struct state *, bool *); static const chr *scanplain(struct vars *); static void onechr(struct vars *, chr, struct state *, struct state *); +static void optimizebracket(struct vars *, struct state *, struct state *); static void wordchrs(struct vars *); static void processlacon(struct vars *, struct state *, struct state *, int, struct state *, struct state *); @@ -81,8 +86,6 @@ static const char *stid(struct subre *, char *, size_t); /* === regc_lex.c === */ static void lexstart(struct vars *); static void prefixes(struct vars *); -static void lexnest(struct vars *, const chr *, const chr *); -static void lexword(struct vars *); static int next(struct vars *); static int lexescape(struct vars *); static chr lexdigits(struct vars *, int, int, int); @@ -206,6 +209,7 @@ static void freecvec(struct cvec *); static int pg_wc_isdigit(pg_wchar c); static int pg_wc_isalpha(pg_wchar c); static int pg_wc_isalnum(pg_wchar c); +static int pg_wc_isword(pg_wchar c); static int pg_wc_isupper(pg_wchar c); static int pg_wc_islower(pg_wchar c); static int pg_wc_isgraph(pg_wchar c); @@ -220,7 +224,8 @@ static chr element(struct vars *, const chr *, const chr *); static struct cvec *range(struct vars *, chr, chr, int); static int before(chr, chr); static struct cvec *eclass(struct vars 
*, chr, int); -static struct cvec *cclass(struct vars *, const chr *, const chr *, int); +static enum char_classes lookupcclass(struct vars *, const chr *, const chr *); +static struct cvec *cclasscvec(struct vars *, enum char_classes, int); static int cclass_column_index(struct colormap *, chr); static struct cvec *allcases(struct vars *, chr); static int cmp(const chr *, const chr *, size_t); @@ -233,14 +238,12 @@ struct vars regex_t *re; const chr *now; /* scan pointer into string */ const chr *stop; /* end of string */ - const chr *savenow; /* saved now and stop for "subroutine call" */ - const chr *savestop; int err; /* error code (0 if none) */ int cflags; /* copy of compile flags */ int lasttype; /* type of previous token */ int nexttype; /* type of next token */ chr nextvalue; /* value (if any) of next token */ - int lexcon; /* lexical context type (see lex.c) */ + int lexcon; /* lexical context type (see regc_lex.c) */ int nsubexp; /* subexpression count */ struct subre **subs; /* subRE pointer vector */ size_t nsubs; /* length of vector */ @@ -287,6 +290,8 @@ struct vars #define ECLASS 'E' /* start of [= */ #define CCLASS 'C' /* start of [: */ #define END 'X' /* end of [. [= [: */ +#define CCLASSS 's' /* char class shorthand escape */ +#define CCLASSC 'c' /* complement char class shorthand escape */ #define RANGE 'R' /* - within [] which might be range delim. 
*/ #define LACON 'L' /* lookaround constraint subRE */ #define AHEAD 'a' /* color-lookahead arc */ @@ -356,7 +361,6 @@ pg_regcomp(regex_t *re, v->re = re; v->now = string; v->stop = v->now + len; - v->savenow = v->savestop = NULL; v->err = 0; v->cflags = flags; v->nsubexp = 0; @@ -835,23 +839,25 @@ parseqatom(struct vars *v, return; break; case '<': - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); nonword(v, BEHIND, lp, s); word(v, AHEAD, s, rp); + NEXT(); return; break; case '>': - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); word(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); + NEXT(); return; break; case WBDRY: - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); nonword(v, BEHIND, lp, s); @@ -860,10 +866,11 @@ parseqatom(struct vars *v, NOERR(); word(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); + NEXT(); return; break; case NWBDRY: - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); word(v, BEHIND, lp, s); @@ -872,6 +879,7 @@ parseqatom(struct vars *v, NOERR(); nonword(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); + NEXT(); return; break; case LACON: /* lookaround constraint */ @@ -925,6 +933,16 @@ parseqatom(struct vars *v, assert(SEE(']') || ISERR()); NEXT(); break; + case CCLASSS: + charclass(v, (enum char_classes) v->nextvalue, lp, rp); + okcolors(v->nfa, v->cm); + NEXT(); + break; + case CCLASSC: + charclasscomplement(v, (enum char_classes) v->nextvalue, lp, rp); + /* charclasscomplement() did okcolors() internally */ + NEXT(); + break; case '.': rainbow(v->nfa, v->cm, PLAIN, (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS, @@ -1338,6 +1356,75 @@ word(struct vars *v, /* (no need for special attention to \n) */ } +/* + * charclass - generate arcs for a character class + * + * This is used for both atoms (\w and sibling escapes) and for elements + * of bracket expressions. 
The caller is responsible for calling okcolors() + * at the end of processing the atom or bracket. + */ +static void +charclass(struct vars *v, + enum char_classes cls, + struct state *lp, + struct state *rp) +{ + struct cvec *cv; + + /* obtain possibly-cached cvec for char class */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, cls, (v->cflags & REG_ICASE)); + NOERR(); + + /* build the arcs; this may cause color splitting */ + subcolorcvec(v, cv, lp, rp); +} + +/* + * charclasscomplement - generate arcs for a complemented character class + * + * This is used for both atoms (\W and sibling escapes) and for elements + * of bracket expressions. In bracket expressions, it is the caller's + * responsibility that there not be any open subcolors when this is called. + */ +static void +charclasscomplement(struct vars *v, + enum char_classes cls, + struct state *lp, + struct state *rp) +{ + struct state *cstate; + struct cvec *cv; + + /* make dummy state to hang temporary arcs on */ + cstate = newstate(v->nfa); + NOERR(); + + /* obtain possibly-cached cvec for char class */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, cls, (v->cflags & REG_ICASE)); + NOERR(); + + /* build arcs for char class; this may cause color splitting */ + subcolorcvec(v, cv, cstate, cstate); + + /* in NLSTOP mode, ensure newline is not part of the result set */ + if (v->cflags & REG_NLSTOP) + newarc(v->nfa, PLAIN, v->nlcolor, cstate, cstate); + NOERR(); + + /* clean up any subcolors in the arc set */ + okcolors(v->nfa, v->cm); + NOERR(); + + /* now build output arcs for the complement of the char class */ + colorcomplement(v->nfa, v->cm, PLAIN, cstate, lp, rp); + NOERR(); + + /* clean up dummy state */ + dropstate(v->nfa, cstate); +} + /* * scannum - scan a number */ @@ -1456,6 +1543,7 @@ repeat(struct vars *v, /* * bracket - handle non-complemented bracket expression + * * Also called from cbracket for complemented bracket expressions. 
*/ static void @@ -1463,16 +1551,52 @@ bracket(struct vars *v, struct state *lp, struct state *rp) { + /* + * We can't process complemented char classes (e.g. \W) immediately while + * scanning the bracket expression, else color bookkeeping gets confused. + * Instead, remember whether we saw any in have_cclassc[], and process + * them at the end. + */ + bool have_cclassc[NUM_CCLASSES]; + bool any_cclassc; + int i; + + memset(have_cclassc, false, sizeof(have_cclassc)); + assert(SEE('[')); NEXT(); while (!SEE(']') && !SEE(EOS)) - brackpart(v, lp, rp); + brackpart(v, lp, rp, have_cclassc); assert(SEE(']') || ISERR()); + + /* close up open subcolors from the positive bracket elements */ okcolors(v->nfa, v->cm); + NOERR(); + + /* now handle any complemented elements */ + any_cclassc = false; + for (i = 0; i < NUM_CCLASSES; i++) + { + if (have_cclassc[i]) + { + charclasscomplement(v, (enum char_classes) i, lp, rp); + NOERR(); + any_cclassc = true; + } + } + + /* + * If we had any complemented elements, see if we can optimize the bracket + * into a rainbow. Since a complemented element is the only way a WHITE + * arc could get into the result, there's no point in checking otherwise. + */ + if (any_cclassc) + optimizebracket(v, lp, rp); } /* * cbracket - handle complemented bracket expression + * * We do it by calling bracket() with dummy endpoints, and then complementing * the result. The alternative would be to invoke rainbow(), and then delete * arcs as the b.e. is seen... but that gets messy, and is really quite @@ -1496,7 +1620,9 @@ cbracket(struct vars *v, /* * Easy part of complementing, and all there is to do since the MCCE code - * was removed. + * was removed. Note that the result of colorcomplement() cannot be a + * rainbow, since we don't allow empty brackets; so there's no point in + * calling optimizebracket() again. 
*/ colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp); NOERR(); @@ -1511,14 +1637,15 @@ cbracket(struct vars *v, static void brackpart(struct vars *v, struct state *lp, - struct state *rp) + struct state *rp, + bool *have_cclassc) { chr startc; chr endc; struct cvec *cv; + enum char_classes cls; const chr *startp; const chr *endp; - chr c[1]; /* parse something, get rid of special cases, take shortcuts */ switch (v->nexttype) @@ -1528,15 +1655,14 @@ brackpart(struct vars *v, return; break; case PLAIN: - c[0] = v->nextvalue; + startc = v->nextvalue; NEXT(); /* shortcut for ordinary chr (not range) */ if (!SEE(RANGE)) { - onechr(v, c[0], lp, rp); + onechr(v, startc, lp, rp); return; } - startc = element(v, c, c + 1); NOERR(); break; case COLLEL: @@ -1564,9 +1690,20 @@ brackpart(struct vars *v, endp = scanplain(v); INSIST(startp < endp, REG_ECTYPE); NOERR(); - cv = cclass(v, startp, endp, (v->cflags & REG_ICASE)); + cls = lookupcclass(v, startp, endp); NOERR(); - subcolorcvec(v, cv, lp, rp); + charclass(v, cls, lp, rp); + return; + break; + case CCLASSS: + charclass(v, (enum char_classes) v->nextvalue, lp, rp); + NEXT(); + return; + break; + case CCLASSC: + /* we cannot call charclasscomplement() immediately */ + have_cclassc[v->nextvalue] = true; + NEXT(); return; break; default: @@ -1582,9 +1719,8 @@ brackpart(struct vars *v, { case PLAIN: case RANGE: - c[0] = v->nextvalue; + endc = v->nextvalue; NEXT(); - endc = element(v, c, c + 1); NOERR(); break; case COLLEL: @@ -1618,7 +1754,7 @@ brackpart(struct vars *v, /* * scanplain - scan PLAIN contents of [. etc. * - * Certain bits of trickery in lex.c know that this code does not try + * Certain bits of trickery in regc_lex.c know that this code does not try * to look past the final bracket of the [. etc. 
*/ static const chr * /* just after end of sequence */ @@ -1664,39 +1800,98 @@ onechr(struct vars *v, subcolorcvec(v, allcases(v, c), lp, rp); } +/* + * optimizebracket - see if bracket expression can be converted to RAINBOW + * + * Cases such as "[\s\S]" can produce a set of arcs of all colors, which we + * can replace by a single RAINBOW arc for efficiency. (This might seem + * like a silly way to write ".", but it's seemingly a common locution in + * some other flavors of regex, so take the trouble to support it well.) + */ +static void +optimizebracket(struct vars *v, + struct state *lp, + struct state *rp) +{ + struct colordesc *cd; + struct colordesc *end = CDEND(v->cm); + struct arc *a; + bool israinbow; + + /* + * Scan lp's out-arcs and transiently mark the mentioned colors. We + * expect that all of lp's out-arcs are plain, non-RAINBOW arcs to rp. + * (Note: there shouldn't be any pseudocolors yet, but check anyway.) + */ + for (a = lp->outs; a != NULL; a = a->outchain) + { + assert(a->type == PLAIN); + assert(a->co >= 0); /* i.e. not RAINBOW */ + assert(a->to == rp); + cd = &v->cm->cd[a->co]; + assert(!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)); + cd->flags |= COLMARK; + } + + /* Scan colors, clear transient marks, check for unmarked live colors */ + israinbow = true; + for (cd = v->cm->cd; cd < end; cd++) + { + if (cd->flags & COLMARK) + cd->flags &= ~COLMARK; + else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)) + israinbow = false; + } + + /* Can't do anything if not all colors have arcs */ + if (!israinbow) + return; + + /* OK, drop existing arcs and replace with a rainbow */ + while ((a = lp->outs) != NULL) + freearc(v->nfa, a); + newarc(v->nfa, PLAIN, RAINBOW, lp, rp); +} + /* * wordchrs - set up word-chr list for word-boundary stuff, if needed * - * The list is kept as a bunch of arcs between two dummy states; it's - * disposed of by the unreachable-states sweep in NFA optimization. - * Does NEXT(). 
Must not be called from any unusual lexical context. - * This should be reconciled with the \w etc. handling in lex.c, and - * should be cleaned up to reduce dependencies on input scanning. + * The list is kept as a bunch of circular arcs on an otherwise-unused state. + * + * Note that this must not be called while we have any open subcolors, + * else construction of the list would confuse color bookkeeping. + * Hence, we can't currently apply a similar optimization in + * charclass[complement](), as those need to be usable within bracket + * expressions. */ static void wordchrs(struct vars *v) { - struct state *left; - struct state *right; + struct state *cstate; + struct cvec *cv; if (v->wordchrs != NULL) - { - NEXT(); /* for consistency */ - return; - } + return; /* done already */ - left = newstate(v->nfa); - right = newstate(v->nfa); + /* make dummy state to hang the cache arcs on */ + cstate = newstate(v->nfa); NOERR(); - /* fine point: implemented with [::], and lexer will set REG_ULOCALE */ - lexword(v); - NEXT(); - assert(v->savenow != NULL && SEE('[')); - bracket(v, left, right); - assert((v->savenow != NULL && SEE(']')) || ISERR()); - NEXT(); + + /* obtain possibly-cached cvec for \w characters */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, CC_WORD, (v->cflags & REG_ICASE)); NOERR(); - v->wordchrs = left; + + /* build the arcs; this may cause color splitting */ + subcolorcvec(v, cv, cstate, cstate); + NOERR(); + + /* close new open subcolors to ensure the cache entry is self-contained */ + okcolors(v->nfa, v->cm); + NOERR(); + + /* success! 
save the cache pointer */ + v->wordchrs = cstate; } /* diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 306525eb5f..0e76a828f8 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -127,6 +127,18 @@ #define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS))) +/* + * known character classes + */ +enum char_classes +{ + CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, + CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT, CC_WORD +}; + +#define NUM_CCLASSES 14 + + /* * As soon as possible, we map chrs into equivalence classes -- "colors" -- * which are of much more manageable number. @@ -164,12 +176,14 @@ struct colordesc #define NOSUB COLORLESS /* value of "sub" when no open subcolor */ struct arc *arcs; /* chain of all arcs of this color */ chr firstchr; /* simple char first assigned to this color */ - int flags; /* bit values defined next */ + int flags; /* bitmask of the following flags: */ #define FREECOL 01 /* currently free */ #define PSEUDO 02 /* pseudocolor, no real chars */ -#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL) +#define COLMARK 04 /* temporary marker used in some functions */ }; +#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL) + /* * The color map itself * @@ -199,8 +213,6 @@ struct colordesc * appear in increasing chr-value order. 
*/ -#define NUM_CCLASSES 13 /* must match data in regc_locale.c */ - typedef struct colormaprange { chr cmin; /* range represents cmin..cmax inclusive */ diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out index 21282789c2..379e44f0a2 100644 --- a/src/test/modules/test_regex/expected/test_regex.out +++ b/src/test/modules/test_regex/expected/test_regex.out @@ -1970,6 +1970,263 @@ select * from test_regex('a[\w]b', 'axb', 'LPE'); {axb} (2 rows) +-- these should be invalid +select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE'); +ERROR: invalid regular expression: invalid character range +select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE'); +ERROR: invalid regular expression: invalid character range +select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS'); +ERROR: invalid regular expression: invalid character range +select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS'); +ERROR: invalid regular expression: invalid character range +-- test complemented char classes within brackets +select * from test_regex('[\D]', '0123456789abc*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {a} +(2 rows) + +select * from test_regex('[^\D]', 'abc0123456789*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {0} +(2 rows) + +select * from test_regex('[1\D7]', '0123456789abc*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {1} +(2 rows) + +select * from test_regex('[7\D1]', '0123456789abc*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {1} +(2 rows) + +select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {2} +(2 rows) + +select * from test_regex('[^1\D0]', 
'abc0123456789*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {2} +(2 rows) + +select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {2} +(2 rows) + +select * from test_regex('\W', '0123456789abc_*', 'LPE'); + test_regex +--------------------------------------------------- + {0,"missing REG_UBBS!",REG_UNONPOSIX,REG_ULOCALE} + {*} +(2 rows) + +select * from test_regex('[\W]', '0123456789abc_*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {*} +(2 rows) + +select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LPE'); + test_regex +---------------------------------------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,"unexpected REG_UEMPTYMATCH!"} + {"012 3456789abc_*"} +(2 rows) + +-- check char classes' handling of newlines +select * from test_regex('\s+', E'abc \n def', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {" + + "} +(2 rows) + +select * from test_regex('\s+', E'abc \n def', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {" + + "} +(2 rows) + +select * from test_regex('[\s]+', E'abc \n def', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {" + + "} +(2 rows) + +select * from test_regex('[\s]+', E'abc \n def', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {" + + "} +(2 rows) + +select * from test_regex('\S+', E'abc\ndef', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('\S+', E'abc\ndef', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('[\S]+', 
E'abc\ndef', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('[\S]+', E'abc\ndef', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('\d+', E'012\n345', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {012} +(2 rows) + +select * from test_regex('\d+', E'012\n345', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {012} +(2 rows) + +select * from test_regex('[\d]+', E'012\n345', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {012} +(2 rows) + +select * from test_regex('[\d]+', E'012\n345', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {012} +(2 rows) + +select * from test_regex('\D+', E'abc\ndef345', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {"abc + + def"} +(2 rows) + +select * from test_regex('\D+', E'abc\ndef345', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('[\D]+', E'abc\ndef345', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {"abc + + def"} +(2 rows) + +select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('\w+', E'abc_012\ndef', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {abc_012} +(2 rows) + +select * from test_regex('\w+', E'abc_012\ndef', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {abc_012} +(2 rows) + +select * from test_regex('[\w]+', 
E'abc_012\ndef', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {abc_012} +(2 rows) + +select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {abc_012} +(2 rows) + +select * from test_regex('\W+', E'***\n@@@___', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {"*** + + @@@"} +(2 rows) + +select * from test_regex('\W+', E'***\n@@@___', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {***} +(2 rows) + +select * from test_regex('[\W]+', E'***\n@@@___', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {"*** + + @@@"} +(2 rows) + +select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {***} +(2 rows) + -- doing 13 "escapes" -- expectError 13.1 & "a\\" EESCAPE select * from test_regex('a\', '', ''); diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql index 31e947ee9c..026c6ee5eb 100644 --- a/src/test/modules/test_regex/sql/test_regex.sql +++ b/src/test/modules/test_regex/sql/test_regex.sql @@ -597,6 +597,51 @@ select * from test_regex('a[\s]b', 'a b', 'LPE'); -- expectMatch 12.18 LPE {a[\w]b} axb axb select * from test_regex('a[\w]b', 'axb', 'LPE'); +-- these should be invalid +select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE'); +select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE'); +select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS'); +select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS'); + +-- test complemented char classes within brackets +select * from test_regex('[\D]', '0123456789abc*', 'LPE'); +select * from test_regex('[^\D]', 'abc0123456789*', 'LPE'); +select * from 
test_regex('[1\D7]', '0123456789abc*', 'LPE'); +select * from test_regex('[7\D1]', '0123456789abc*', 'LPE'); +select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE'); +select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE'); +select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE'); +select * from test_regex('\W', '0123456789abc_*', 'LPE'); +select * from test_regex('[\W]', '0123456789abc_*', 'LPE'); +select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LPE'); + +-- check char classes' handling of newlines +select * from test_regex('\s+', E'abc \n def', 'LP'); +select * from test_regex('\s+', E'abc \n def', 'nLP'); +select * from test_regex('[\s]+', E'abc \n def', 'LPE'); +select * from test_regex('[\s]+', E'abc \n def', 'nLPE'); +select * from test_regex('\S+', E'abc\ndef', 'LP'); +select * from test_regex('\S+', E'abc\ndef', 'nLP'); +select * from test_regex('[\S]+', E'abc\ndef', 'LPE'); +select * from test_regex('[\S]+', E'abc\ndef', 'nLPE'); +select * from test_regex('\d+', E'012\n345', 'LP'); +select * from test_regex('\d+', E'012\n345', 'nLP'); +select * from test_regex('[\d]+', E'012\n345', 'LPE'); +select * from test_regex('[\d]+', E'012\n345', 'nLPE'); +select * from test_regex('\D+', E'abc\ndef345', 'LP'); +select * from test_regex('\D+', E'abc\ndef345', 'nLP'); +select * from test_regex('[\D]+', E'abc\ndef345', 'LPE'); +select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE'); +select * from test_regex('\w+', E'abc_012\ndef', 'LP'); +select * from test_regex('\w+', E'abc_012\ndef', 'nLP'); +select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE'); +select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE'); +select * from test_regex('\W+', E'***\n@@@___', 'LP'); +select * from test_regex('\W+', E'***\n@@@___', 'nLP'); +select * from test_regex('[\W]+', E'***\n@@@___', 'LPE'); +select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE'); + + -- doing 13 "escapes" -- expectError 13.1 & "a\\" EESCAPE
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 1f08bccb8b..cab276db45 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -6813,13 +6813,17 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); and bracket expressions using <literal>^</literal> will never match the newline character (so that matches will never cross newlines unless the RE - explicitly arranges it) + explicitly includes a newline) and <literal>^</literal> and <literal>$</literal> will match the empty string after and before a newline respectively, in addition to matching at beginning and end of string respectively. But the ARE escapes <literal>\A</literal> and <literal>\Z</literal> continue to match beginning or end of string <emphasis>only</emphasis>. + Also, the character class shorthands <literal>\D</literal> + and <literal>\W</literal> will match a newline regardless of this mode. + (Before <productname>PostgreSQL</productname> 14, they did not match + newlines in newline-sensitive mode.) </para> <para> diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n index 1afaa7cce7..93830fd100 100644 --- a/src/backend/regex/re_syntax.n +++ b/src/backend/regex/re_syntax.n @@ -804,7 +804,7 @@ and bracket expressions using \fB^\fR will never match the newline character (so that matches will never cross newlines unless the RE -explicitly arranges it) +explicitly includes a newline) and \fB^\fR and @@ -817,6 +817,11 @@ ARE and \fB\eZ\fR continue to match beginning or end of string \fIonly\fR. +Also, the character class shorthands +\fB\eD\fR +and +\fB\eW\fR +will match a newline regardless of this mode. 
.PP If partial newline-sensitive matching is specified, this affects \fB.\fR diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 7b77a29136..d3540fdd0f 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -1407,10 +1407,6 @@ charclasscomplement(struct vars *v, /* build arcs for char class; this may cause color splitting */ subcolorcvec(v, cv, cstate, cstate); - - /* in NLSTOP mode, ensure newline is not part of the result set */ - if (v->cflags & REG_NLSTOP) - newarc(v->nfa, PLAIN, v->nlcolor, cstate, cstate); NOERR(); /* clean up any subcolors in the arc set */ @@ -1612,6 +1608,8 @@ cbracket(struct vars *v, NOERR(); bracket(v, left, right); + + /* in NLSTOP mode, ensure newline is not part of the result set */ if (v->cflags & REG_NLSTOP) newarc(v->nfa, PLAIN, v->nlcolor, left, right); NOERR(); diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out index 379e44f0a2..ef6206694f 100644 --- a/src/test/modules/test_regex/expected/test_regex.out +++ b/src/test/modules/test_regex/expected/test_regex.out @@ -2151,7 +2151,8 @@ select * from test_regex('\D+', E'abc\ndef345', 'nLP'); test_regex ------------------------------- {0,REG_UNONPOSIX,REG_ULOCALE} - {abc} + {"abc + + def"} (2 rows) select * from test_regex('[\D]+', E'abc\ndef345', 'LPE'); @@ -2166,7 +2167,8 @@ select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE'); test_regex ---------------------------------------- {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} - {abc} + {"abc + + def"} (2 rows) select * from test_regex('\w+', E'abc_012\ndef', 'LP'); @@ -2209,7 +2211,8 @@ select * from test_regex('\W+', E'***\n@@@___', 'nLP'); test_regex ------------------------------- {0,REG_UNONPOSIX,REG_ULOCALE} - {***} + {"*** + + @@@"} (2 rows) select * from test_regex('[\W]+', E'***\n@@@___', 'LPE'); @@ -2224,7 +2227,8 @@ select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE'); test_regex 
---------------------------------------- {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} - {***} + {"*** + + @@@"} (2 rows) -- doing 13 "escapes"