Norihiro Tanaka wrote:
However, the patch adds an argument to dfasyntax(). To synchronize between grep and dfa easily, I expect it is applied before dfa is moved to gnulib.
Since we're already changing the DFA API, how about if we have a flags arg that combines all these little Boolean arguments? That will make future changes less disruptive. Although the existing code supports any newline terminator, in practice only '\0' and '\n' are useful, so it's really just a boolean.
Most of the changes in your proposed patch are subsumed by the changes for multithreading, so the patch can be simplified now. I installed the attached, and plan to follow up shortly about the corresponding Gawk changes that I'll propose.
From 2ba4bc63bb7cf5f573b9aff929cf1e5cb045d683 Mon Sep 17 00:00:00 2001 From: Paul Eggert <egg...@cs.ucla.edu> Date: Fri, 2 Sep 2016 15:27:12 -0700 Subject: [PATCH] dfa: new option for anchored searches This follows up on a suggestion by Norihiro Tanaka (Bug#24262). * src/dfa.c (struct regex_syntax): New member 'anchor'. (char_context): Use it. (dfasyntax): Change signature to specify it, along with the old FOLD and EOL args, as a single DFAOPTS arg. All uses changed. * src/dfa.h (DFA_ANCHOR, DFA_CASE_FOLD, DFA_EOL_NUL): New constants for dfasyntax new last arg. --- src/dfa.c | 13 +++++++++---- src/dfa.h | 22 ++++++++++++++++++---- src/dfasearch.c | 4 +++- tests/dfa-match-aux.c | 2 +- 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 4cbaa75..ff3721c 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -335,6 +335,10 @@ struct regex_syntax /* Flag for case-folding letters into sets. */ bool case_fold; + /* True if ^ and $ match only the start and end of data, and do not match + end-of-line within data. */ + bool anchor; + /* End-of-line byte in data. */ unsigned char eolbyte; @@ -754,7 +758,7 @@ unibyte_word_constituent (struct dfa const *dfa, unsigned char c) static int char_context (struct dfa const *dfa, unsigned char c) { - if (c == dfa->syntax.eolbyte) + if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor) return CTX_NEWLINE; if (unibyte_word_constituent (dfa, c)) return CTX_LETTER; @@ -3987,7 +3991,7 @@ dfaalloc (void) /* Initialize DFA. 
*/ void dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, - reg_syntax_t bits, bool fold, unsigned char eol) + reg_syntax_t bits, int dfaopts) { int i; memset (dfa, 0, offsetof (struct dfa, dfaexec)); @@ -4000,9 +4004,10 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, dfa->canychar = -1; dfa->lex.cur_mb_len = 1; dfa->syntax.syntax_bits_set = true; + dfa->syntax.case_fold = (dfaopts & DFA_CASE_FOLD) != 0; + dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0; + dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n'; dfa->syntax.syntax_bits = bits; - dfa->syntax.case_fold = fold; - dfa->syntax.eolbyte = eol; for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { diff --git a/src/dfa.h b/src/dfa.h index 31baf7a..b8c44cc 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -46,15 +46,29 @@ struct dfa; calling dfafree() on it. */ extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC; +/* DFA options that can be ORed together, for dfasyntax's 4th arg. */ +enum + { + /* ^ and $ match only the start and end of data, and do not match + end-of-line within data. This is always false for grep, but + possibly true for other apps. */ + DFA_ANCHOR = 1 << 0, + + /* Ignore case while matching. */ + DFA_CASE_FOLD = 1 << 1, + + /* '\0' in data is end-of-line, instead of the traditional '\n'. */ + DFA_EOL_NUL = 1 << 2 + }; + /* Initialize or reinitialize a DFA. This must be called before any of the routines below. The arguments are: 1. The DFA to operate on. 2. Information about the current locale. - 3. The syntax bits described earlier in this file. - 4. The case-folding flag. - 5. The line terminator. */ + 3. Syntax bits described in regex.h. + 4. Additional DFA options described above. */ extern void dfasyntax (struct dfa *, struct localeinfo const *, - reg_syntax_t, bool, unsigned char); + reg_syntax_t, int); /* Build and return the struct dfamust from the given struct dfa. 
*/ extern struct dfamust *dfamust (struct dfa const *); diff --git a/src/dfasearch.c b/src/dfasearch.c index 0838e1f..96be58f 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -123,7 +123,9 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) if (match_icase) syntax_bits |= RE_ICASE; re_set_syntax (syntax_bits); - dfasyntax (dfa, &localeinfo, syntax_bits, match_icase, eolbyte); + int dfaopts = ((match_icase ? DFA_CASE_FOLD : 0) + | (eolbyte ? 0 : DFA_EOL_NUL)); + dfasyntax (dfa, &localeinfo, syntax_bits, dfaopts); /* For GNU regex, pass the patterns separately to detect errors like "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c index e001b7d..070089c 100644 --- a/tests/dfa-match-aux.c +++ b/tests/dfa-match-aux.c @@ -58,7 +58,7 @@ main (int argc, char **argv) init_localeinfo (&localeinfo); dfa = dfaalloc (); - dfasyntax (dfa, &localeinfo, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n'); + dfasyntax (dfa, &localeinfo, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0); dfacomp (argv[1], strlen (argv[1]), dfa, 0); beg = argv[2]; -- 2.7.4