* src/dfa.c: remove the file-scope dfa pointer. A pointer to a struct dfa is instead passed as a parameter to the functions that had been using the global, and the charclass_index wrapper is replaced by direct calls to dfa_charclass_index. --- src/dfa.c | 207 +++++++++++++++++++++++++++++--------------------------------- 1 file changed, 98 insertions(+), 109 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c index d337bb6..5bd2a92 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -451,7 +451,7 @@ struct dfa #define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \ SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr) -static void regexp (void); +static void regexp (struct dfa *dfa); /* A table indexed by byte values that contains the corresponding wide character (if any) for that byte. WEOF means the byte is not a @@ -670,16 +670,6 @@ dfa_charclass_index (struct dfa *d, charclass const s) return i; } -/* A pointer to the current dfa is kept here during parsing. */ -static struct dfa *dfa; - -/* Find the index of charclass S in the current DFA, or allocate a new one. */ -static size_t -charclass_index (charclass const s) -{ - return dfa_charclass_index (dfa, s); -} - /* Syntax bits controlling the behavior of the lexical analyzer. */ static reg_syntax_t syntax_bits; static bool syntax_bits_set; @@ -807,7 +797,7 @@ using_utf8 (void) processed more efficiently. */ static bool -using_simple_locale (void) +using_simple_locale (struct dfa *dfa) { /* The native character set is known to be compatible with the C locale. The following test isn't perfect, but it's good @@ -870,7 +860,7 @@ static wint_t wctok; /* Wide character representation of the current of length 1); otherwise set WC to WEOF. If there is no more input, report EOFERR if EOFERR is not null, and return lasttok = END otherwise. */ -# define FETCH_WC(c, wc, eoferr) \ +# define FETCH_WC(dfa, c, wc, eoferr) \ do { \ if (! lexleft) \ { \ @@ -984,7 +974,7 @@ find_pred (const char *str) /* Multibyte character handling sub-routine for lex. Parse a bracket expression and build a struct mb_char_classes. 
*/ static token -parse_bracket_exp (void) +parse_bracket_exp (struct dfa *dfa) { bool invert; int c, c1, c2; @@ -1028,12 +1018,12 @@ parse_bracket_exp (void) work_mbc = NULL; memset (ccl, 0, sizeof ccl); - FETCH_WC (c, wc, _("unbalanced [")); + FETCH_WC (dfa, c, wc, _("unbalanced [")); if (c == '^') { - FETCH_WC (c, wc, _("unbalanced [")); + FETCH_WC (dfa, c, wc, _("unbalanced [")); invert = true; - known_bracket_exp = using_simple_locale (); + known_bracket_exp = using_simple_locale (dfa); } else invert = false; @@ -1050,7 +1040,7 @@ parse_bracket_exp (void) dfa is ever called. */ if (c == '[') { - FETCH_WC (c1, wc1, _("unbalanced [")); + FETCH_WC (dfa, c1, wc1, _("unbalanced [")); if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES)) || c1 == '.' || c1 == '=') @@ -1060,7 +1050,7 @@ parse_bracket_exp (void) size_t len = 0; for (;;) { - FETCH_WC (c, wc, _("unbalanced [")); + FETCH_WC (dfa, c, wc, _("unbalanced [")); if ((c == c1 && *lexptr == ']') || lexleft == 0) break; if (len < MAX_BRACKET_STRING_LEN) @@ -1072,7 +1062,7 @@ parse_bracket_exp (void) str[len] = '\0'; /* Fetch bracket. */ - FETCH_WC (c, wc, _("unbalanced [")); + FETCH_WC (dfa, c, wc, _("unbalanced [")); if (c1 == ':') /* Build character class. POSIX allows character classes to match multicharacter collating elements, @@ -1099,7 +1089,7 @@ parse_bracket_exp (void) colon_warning_state |= 8; /* Fetch new lookahead character. */ - FETCH_WC (c1, wc1, _("unbalanced [")); + FETCH_WC (dfa, c1, wc1, _("unbalanced [")); continue; } @@ -1108,15 +1098,15 @@ parse_bracket_exp (void) } if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) - FETCH_WC (c, wc, _("unbalanced [")); + FETCH_WC (dfa, c, wc, _("unbalanced [")); if (c1 == NOTCHAR) - FETCH_WC (c1, wc1, _("unbalanced [")); + FETCH_WC (dfa, c1, wc1, _("unbalanced [")); if (c1 == '-') /* build range characters. 
*/ { - FETCH_WC (c2, wc2, _("unbalanced [")); + FETCH_WC (dfa, c2, wc2, _("unbalanced [")); /* A bracket expression like [a-[.aa.]] matches an unknown set. Treat it like [-a[.aa.]] while parsing it, and @@ -1137,17 +1127,17 @@ parse_bracket_exp (void) else { if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) - FETCH_WC (c2, wc2, _("unbalanced [")); + FETCH_WC (dfa, c2, wc2, _("unbalanced [")); colon_warning_state |= 8; - FETCH_WC (c1, wc1, _("unbalanced [")); + FETCH_WC (dfa, c1, wc1, _("unbalanced [")); /* Treat [x-y] as a range if x != y. */ if (wc != wc2 || wc == WEOF) { if (dfa->multibyte) known_bracket_exp = false; - else if (using_simple_locale ()) + else if (using_simple_locale (dfa)) { int ci; for (ci = c; ci <= c2; ci++) @@ -1214,7 +1204,7 @@ parse_bracket_exp (void) if (dfa->multibyte) { work_mbc->invert = invert; - work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (ccl); + work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl); return MBCSET; } @@ -1226,7 +1216,7 @@ parse_bracket_exp (void) clrbit ('\n', ccl); } - return CSET + charclass_index (ccl); + return CSET + dfa_charclass_index (dfa, ccl); } #define PUSH_LEX_STATE(s) \ @@ -1244,7 +1234,7 @@ parse_bracket_exp (void) while (false) static token -lex (void) +lex (struct dfa *dfa) { int c, c2; bool backslash = false; @@ -1259,7 +1249,7 @@ lex (void) "if (backslash) ...". 
*/ for (i = 0; i < 2; ++i) { - FETCH_WC (c, wctok, NULL); + FETCH_WC (dfa, c, wctok, NULL); switch (c) { @@ -1472,7 +1462,7 @@ lex (void) if (syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', ccl); laststart = false; - return lasttok = CSET + charclass_index (ccl); + return lasttok = CSET + dfa_charclass_index (dfa, ccl); case 's': case 'S': @@ -1487,7 +1477,7 @@ lex (void) if (c == 'S') notset (ccl); laststart = false; - return lasttok = CSET + charclass_index (ccl); + return lasttok = CSET + dfa_charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1498,7 +1488,7 @@ lex (void) strings, each minus its "already processed" '['. */ PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]"); - lasttok = parse_bracket_exp (); + lasttok = parse_bracket_exp (dfa); POP_LEX_STATE (); @@ -1519,7 +1509,7 @@ lex (void) if (c == 'W') notset (ccl); laststart = false; - return lasttok = CSET + charclass_index (ccl); + return lasttok = CSET + dfa_charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1530,7 +1520,7 @@ lex (void) strings, each minus its "already processed" '['. */ PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]"); - lasttok = parse_bracket_exp (); + lasttok = parse_bracket_exp (dfa); POP_LEX_STATE (); @@ -1541,7 +1531,7 @@ lex (void) if (backslash) goto normal_char; laststart = false; - return lasttok = parse_bracket_exp (); + return lasttok = parse_bracket_exp (dfa); default: normal_char: @@ -1555,7 +1545,7 @@ lex (void) { zeroset (ccl); setbit_case_fold_c (c, ccl); - return lasttok = CSET + charclass_index (ccl); + return lasttok = CSET + dfa_charclass_index (dfa, ccl); } return lasttok = c; @@ -1578,7 +1568,7 @@ static size_t depth; /* Current depth of a hypothetical stack dfaanalyze. 
*/ static void -addtok_mb (token t, int mbprop) +addtok_mb (struct dfa *dfa, token t, int mbprop) { if (dfa->talloc == dfa->tindex) { @@ -1618,12 +1608,12 @@ addtok_mb (token t, int mbprop) dfa->depth = depth; } -static void addtok_wc (wint_t wc); +static void addtok_wc (struct dfa *dfa, wint_t wc); /* Add the given token to the parse tree, maintaining the depth count and updating the maximum depth if necessary. */ static void -addtok (token t) +addtok (struct dfa *dfa, token t) { if (dfa->multibyte && t == MBCSET) { @@ -1635,9 +1625,9 @@ addtok (token t) This does not require UTF-8. */ for (i = 0; i < work_mbc->nchars; i++) { - addtok_wc (work_mbc->chars[i]); + addtok_wc (dfa, work_mbc->chars[i]); if (need_or) - addtok (OR); + addtok (dfa, OR); need_or = true; } work_mbc->nchars = 0; @@ -1646,14 +1636,14 @@ addtok (token t) that the mbcset is empty now. Do nothing in that case. */ if (work_mbc->cset != -1) { - addtok (CSET + work_mbc->cset); + addtok (dfa, CSET + work_mbc->cset); if (need_or) - addtok (OR); + addtok (dfa, OR); } } else { - addtok_mb (t, 3); + addtok_mb (dfa, t, 3); } } @@ -1664,7 +1654,7 @@ addtok (token t) <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */ static void -addtok_wc (wint_t wc) +addtok_wc (struct dfa *dfa, wint_t wc) { unsigned char buf[MB_LEN_MAX]; mbstate_t s = { 0 }; @@ -1681,16 +1671,16 @@ addtok_wc (wint_t wc) buf[0] = 0; } - addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1); + addtok_mb (dfa, buf[0], cur_mb_len == 1 ? 3 : 1); for (i = 1; i < cur_mb_len; i++) { - addtok_mb (buf[i], i == cur_mb_len - 1 ? 2 : 0); - addtok (CAT); + addtok_mb (dfa, buf[i], i == cur_mb_len - 1 ? 2 : 0); + addtok (dfa, CAT); } } static void -add_utf8_anychar (void) +add_utf8_anychar (struct dfa *dfa) { static charclass const utf8_classes[5] = { /* 80-bf: non-leading bytes. 
*/ @@ -1724,7 +1714,7 @@ add_utf8_anychar (void) if (syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', c); } - dfa->utf8_anychar_classes[i] = CSET + charclass_index (c); + dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c); } /* A valid UTF-8 character is @@ -1738,12 +1728,12 @@ add_utf8_anychar (void) and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */ for (i = 1; i < n; i++) - addtok (dfa->utf8_anychar_classes[i]); + addtok (dfa, dfa->utf8_anychar_classes[i]); while (--i > 1) { - addtok (dfa->utf8_anychar_classes[0]); - addtok (CAT); - addtok (OR); + addtok (dfa, dfa->utf8_anychar_classes[0]); + addtok (dfa, CAT); + addtok (dfa, OR); } } @@ -1783,15 +1773,15 @@ add_utf8_anychar (void) The parser builds a parse tree in postfix form in an array of tokens. */ static void -atom (void) +atom (struct dfa *dfa) { if (tok == WCHAR) { if (wctok == WEOF) - addtok (BACKREF); + addtok (dfa, BACKREF); else { - addtok_wc (wctok); + addtok_wc (dfa, wctok); if (case_fold) { @@ -1799,13 +1789,13 @@ atom (void) unsigned int i, n = case_folded_counterparts (wctok, folded); for (i = 0; i < n; i++) { - addtok_wc (folded[i]); - addtok (OR); + addtok_wc (dfa, folded[i]); + addtok (dfa, OR); } } } - tok = lex (); + tok = lex (dfa); } else if (tok == ANYCHAR && using_utf8 ()) { @@ -1816,32 +1806,32 @@ atom (void) it is done above in add_utf8_anychar. So, let's start with UTF-8: it is the most used, and the structure of the encoding makes the correctness more obvious. 
*/ - add_utf8_anychar (); - tok = lex (); + add_utf8_anychar (dfa); + tok = lex (dfa); } else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD || tok == ANYCHAR || tok == MBCSET || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) { - addtok (tok); - tok = lex (); + addtok (dfa, tok); + tok = lex (dfa); } else if (tok == LPAREN) { - tok = lex (); - regexp (); + tok = lex (dfa); + regexp (dfa); if (tok != RPAREN) dfaerror (_("unbalanced (")); - tok = lex (); + tok = lex (dfa); } else - addtok (EMPTY); + addtok (dfa, EMPTY); } /* Return the number of tokens in the given subexpression. */ static size_t _GL_ATTRIBUTE_PURE -nsubtoks (size_t tindex) +nsubtoks (struct dfa *dfa, size_t tindex) { size_t ntoks1; @@ -1852,90 +1842,90 @@ nsubtoks (size_t tindex) case QMARK: case STAR: case PLUS: - return 1 + nsubtoks (tindex - 1); + return 1 + nsubtoks (dfa, tindex - 1); case CAT: case OR: - ntoks1 = nsubtoks (tindex - 1); - return 1 + ntoks1 + nsubtoks (tindex - 1 - ntoks1); + ntoks1 = nsubtoks (dfa, tindex - 1); + return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1); } } /* Copy the given subexpression to the top of the tree. 
*/ static void -copytoks (size_t tindex, size_t ntokens) +copytoks (struct dfa *dfa, size_t tindex, size_t ntokens) { size_t i; if (dfa->multibyte) for (i = 0; i < ntokens; ++i) - addtok_mb (dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]); + addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]); else for (i = 0; i < ntokens; ++i) - addtok_mb (dfa->tokens[tindex + i], 3); + addtok_mb (dfa, dfa->tokens[tindex + i], 3); } static void -closure (void) +closure (struct dfa *dfa) { int i; size_t tindex, ntokens; - atom (); + atom (dfa); while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) if (tok == REPMN && (minrep || maxrep)) { - ntokens = nsubtoks (dfa->tindex); + ntokens = nsubtoks (dfa, dfa->tindex); tindex = dfa->tindex - ntokens; if (maxrep < 0) - addtok (PLUS); + addtok (dfa, PLUS); if (minrep == 0) - addtok (QMARK); + addtok (dfa, QMARK); for (i = 1; i < minrep; ++i) { - copytoks (tindex, ntokens); - addtok (CAT); + copytoks (dfa, tindex, ntokens); + addtok (dfa, CAT); } for (; i < maxrep; ++i) { - copytoks (tindex, ntokens); - addtok (QMARK); - addtok (CAT); + copytoks (dfa, tindex, ntokens); + addtok (dfa, QMARK); + addtok (dfa, CAT); } - tok = lex (); + tok = lex (dfa); } else if (tok == REPMN) { - dfa->tindex -= nsubtoks (dfa->tindex); - tok = lex (); - closure (); + dfa->tindex -= nsubtoks (dfa, dfa->tindex); + tok = lex (dfa); + closure (dfa); } else { - addtok (tok); - tok = lex (); + addtok (dfa, tok); + tok = lex (dfa); } } static void -branch (void) +branch (struct dfa* dfa) { - closure (); + closure (dfa); while (tok != RPAREN && tok != OR && tok >= 0) { - closure (); - addtok (CAT); + closure (dfa); + addtok (dfa, CAT); } } static void -regexp (void) +regexp (struct dfa *dfa) { - branch (); + branch (dfa); while (tok == OR) { - tok = lex (); - branch (); - addtok (OR); + tok = lex (dfa); + branch (dfa); + addtok (dfa, OR); } } @@ -1945,13 +1935,12 @@ regexp (void) static void dfaparse (char const *s, size_t 
len, struct dfa *d) { - dfa = d; lexptr = s; lexleft = len; lasttok = END; laststart = true; parens = 0; - if (dfa->multibyte) + if (d->multibyte) { cur_mb_len = 0; memset (&d->mbs, 0, sizeof d->mbs); @@ -1960,19 +1949,19 @@ dfaparse (char const *s, size_t len, struct dfa *d) if (!syntax_bits_set) dfaerror (_("no syntax specified")); - tok = lex (); + tok = lex (d); depth = d->depth; - regexp (); + regexp (d); if (tok != END) dfaerror (_("unbalanced )")); - addtok (END - d->nregexps); - addtok (CAT); + addtok (d, END - d->nregexps); + addtok (d, CAT); if (d->nregexps) - addtok (OR); + addtok (d, OR); ++d->nregexps; } -- 2.8.1