In comparing regcomp.c from gnulib with the one in glibc, I found numerous important differences:
- one bug that's been fixed in gnulib for 5 years, yet not in glibc. I reported it here: http://sourceware.org/bugzilla/show_bug.cgi?id=11127 - numerous bugs fixed in glibc but not in gnulib: regcomp: sync from glibc; always use nl_langinfo regcomp: sync from glibc: remove dead store regcomp: fix typo in comment regcomp: recognize ill-formed { } expressions regcomp: skip collseq lookup when there are no rules regcomp, regexec, fnmatch: avoid array bounds read error >From 9edb4055c90a2a29fdb80bc054bd2f4f234fef11 Mon Sep 17 00:00:00 2001 From: Jim Meyering <meyer...@redhat.com> Date: Mon, 4 Jan 2010 09:07:52 +0100 Subject: [PATCH 1/6] regcomp: sync from glibc; always use nl_langinfo * lib/regcomp.c (init_dfa) [!LIBC]: Always use nl_langinfo (CODESET), now that gnulib provides it. Recognize UTF8 as well as UTF-8. * modules/regex (Depends-on): Add nl_langinfo. --- ChangeLog | 7 +++++++ lib/regcomp.c | 7 ++++++- modules/regex | 1 + 3 files changed, 14 insertions(+), 1 deletions(-) diff --git a/ChangeLog b/ChangeLog index ecf7295..8d8b495 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2010-01-04 Jim Meyering <meyer...@redhat.com> + + regcomp: sync from glibc; always use nl_langinfo + * lib/regcomp.c (init_dfa) [!LIBC]: Always use nl_langinfo (CODESET), + now that gnulib provides it. Recognize UTF8 as well as UTF-8. + * modules/regex (Depends-on): Add nl_langinfo. + 2010-01-01 Bruno Haible <br...@clisp.org> wchar: Remove unused configure check. 
diff --git a/lib/regcomp.c b/lib/regcomp.c index 9745bea..3f21722 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -850,6 +850,9 @@ static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len) { __re_size_t table_size; +#ifndef _LIBC + char *codeset_name; +#endif #ifdef RE_ENABLE_I18N size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); #else @@ -893,7 +896,9 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) != 0); #else - if (strcmp (locale_charset (), "UTF-8") == 0) + codeset_name = nl_langinfo (CODESET); + if (strcasecmp (codeset_name, "UTF-8") == 0 + || strcasecmp (codeset_name, "UTF8") == 0) dfa->is_utf8 = 1; /* We check exhaustively in the loop below if this charset is a diff --git a/modules/regex b/modules/regex index c6a1235..f516406 100644 --- a/modules/regex +++ b/modules/regex @@ -22,6 +22,7 @@ memcmp memmove mbrtowc mbsinit +nl_langinfo stdbool stdint ssize_t -- 1.6.6.384.g14e6a >From 33e6868272f7fbbc86ec4685d48858c5fed74465 Mon Sep 17 00:00:00 2001 From: Jim Meyering <meyer...@redhat.com> Date: Mon, 4 Jan 2010 09:09:22 +0100 Subject: [PATCH 2/6] regcomp: sync from glibc: remove dead store * lib/regcomp.c (duplicate_node_closure): Remove useless search_duplicated_node call and dead store. --- ChangeLog | 4 ++++ lib/regcomp.c | 1 - 2 files changed, 4 insertions(+), 1 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8d8b495..0c59950 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2010-01-04 Jim Meyering <meyer...@redhat.com> + regcomp: sync from glibc: remove dead store + * lib/regcomp.c (duplicate_node_closure): Remove useless + search_duplicated_node call and dead store. + regcomp: sync from glibc; always use nl_langinfo * lib/regcomp.c (init_dfa) [!LIBC]: Always use nl_langinfo (CODESET), now that gnulib provides it. Recognize UTF8 as well as UTF-8. 
diff --git a/lib/regcomp.c b/lib/regcomp.c index 3f21722..0b900c5 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -1503,7 +1503,6 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, destination. */ org_dest = dfa->edests[org_node].elems[0]; re_node_set_empty (dfa->edests + clone_node); - clone_dest = search_duplicated_node (dfa, org_dest, constraint); /* If the node is root_node itself, it means the epsilon closure has a loop. Then tie it to the destination of the root_node. */ if (org_node == root_node && clone_node != org_node) -- 1.6.6.384.g14e6a >From 7701a216d0e01c2268658da019ae894a0db8b4f7 Mon Sep 17 00:00:00 2001 From: Jim Meyering <meyer...@redhat.com> Date: Mon, 4 Jan 2010 10:47:58 +0100 Subject: [PATCH 3/6] regcomp: fix typo in comment * lib/regcomp.c (duplicate_node_closure): Sync from glibc. s/satisfy/satisfies/. --- ChangeLog | 4 ++++ lib/regcomp.c | 2 +- 2 files changed, 5 insertions(+), 1 deletions(-) diff --git a/ChangeLog b/ChangeLog index 0c59950..d6364d9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2010-01-04 Jim Meyering <meyer...@redhat.com> + regcomp: fix typo in comment + * lib/regcomp.c (duplicate_node_closure): Sync from glibc. + s/satisfy/satisfies/. + regcomp: sync from glibc: remove dead store * lib/regcomp.c (duplicate_node_closure): Remove useless search_duplicated_node call and dead store. diff --git a/lib/regcomp.c b/lib/regcomp.c index 0b900c5..ae75e1a 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -1546,7 +1546,7 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, } else { - /* There is a duplicated node which satisfy the constraint, + /* There is a duplicated node which satisfies the constraint, use it to avoid infinite loop. */ ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (! 
ok, 0)) -- 1.6.6.384.g14e6a >From 2c8aba0e2b95b5639f55631c709bcb0bacdd091d Mon Sep 17 00:00:00 2001 From: Ulrich Drepper <drep...@redhat.com> Date: Mon, 4 Jan 2010 10:51:34 +0100 Subject: [PATCH 4/6] regcomp: recognize ill-formed { } expressions * lib/regcomp.c (parse_dup_op): From glibc: http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a87cd2894cb --- ChangeLog | 4 ++++ lib/regcomp.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletions(-) diff --git a/ChangeLog b/ChangeLog index d6364d9..f6a1fa4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2010-01-04 Jim Meyering <meyer...@redhat.com> + regcomp: recognize ill-formed { } expressions + * lib/regcomp.c (parse_dup_op): From glibc: + http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a87cd2894cb + regcomp: fix typo in comment * lib/regcomp.c (duplicate_node_closure): Sync from glibc. s/satisfy/satisfies/. diff --git a/lib/regcomp.c b/lib/regcomp.c index ae75e1a..3082a74 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -2519,7 +2519,8 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, return elem; } - if (BE (end != REG_MISSING && start > end, 0)) + if (BE ((end != REG_MISSING && start > end) + || token->type != OP_CLOSE_DUP_NUM, 0)) { /* First number greater than second. 
*/ *err = REG_BADBR; -- 1.6.6.384.g14e6a >From 2e07aaa8a3bd54444f2165030fec44fcd35516ca Mon Sep 17 00:00:00 2001 From: Ulrich Drepper <drep...@redhat.com> Date: Mon, 4 Jan 2010 10:59:51 +0100 Subject: [PATCH 5/6] regcomp: skip collseq lookup when there are no rules * lib/regcomp.c (lookup_collation_sequence_value): From glibc: http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a532a41df58 --- ChangeLog | 4 ++++ lib/regcomp.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index f6a1fa4..bc4d058 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2010-01-04 Jim Meyering <meyer...@redhat.com> + regcomp: skip collseq lookup when there are no rules + * lib/regcomp.c (lookup_collation_sequence_value): From glibc: + http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a532a41df58 + regcomp: recognize ill-formed { } expressions * lib/regcomp.c (parse_dup_op): From glibc: http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a87cd2894cb diff --git a/lib/regcomp.c b/lib/regcomp.c index 3082a74..7e300a0 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -2804,7 +2804,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, return elem; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Look up the collation sequence value of BR_ELEM. Return the value if succeeded, UINT_MAX otherwise. 
*/ @@ -2828,7 +2828,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, } else if (br_elem->type == MB_CHAR) { - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + if (nrules != 0) + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); } else if (br_elem->type == COLL_SYM) { -- 1.6.6.384.g14e6a >From 9ed79744c1a5d6b8394c7931e2bfd2ecb09397ad Mon Sep 17 00:00:00 2001 From: Ulrich Drepper <drep...@redhat.com> Date: Mon, 4 Jan 2010 11:18:51 +0100 Subject: [PATCH 6/6] regcomp, regexec, fnmatch: avoid array bounds read error * lib/regcomp.c (build_equiv_class): From glibc: Use only the low 24 bits of a findidx return value as an index into the weights array. Patch by Ulrich Drepper: http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commit;h=b7d1c5fa30 * lib/regexec.c (check_node_accept_bytes): Likewise. * lib/fnmatch_loop.c (FCT): Likewise. --- ChangeLog | 8 ++++++++ lib/fnmatch_loop.c | 9 +++++++-- lib/regcomp.c | 10 +++++++--- lib/regexec.c | 11 ++++++++--- 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index bc4d058..90e9af4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,13 @@ 2010-01-04 Jim Meyering <meyer...@redhat.com> + regcomp, regexec, fnmatch: avoid array bounds read error + * lib/regcomp.c (build_equiv_class): From glibc: + Use only the low 24 bits of a findidx return value as an index + into the weights array. Patch by Ulrich Drepper: + http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commit;h=b7d1c5fa30 + * lib/regexec.c (check_node_accept_bytes): Likewise. + * lib/fnmatch_loop.c (FCT): Likewise. 
+ regcomp: skip collseq lookup when there are no rules * lib/regcomp.c (lookup_collation_sequence_value): From glibc: http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a532a41df58 diff --git a/lib/fnmatch_loop.c b/lib/fnmatch_loop.c index bb24904..8cd4444 100644 --- a/lib/fnmatch_loop.c +++ b/lib/fnmatch_loop.c @@ -382,15 +382,20 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end, /* We found a table entry. Now see whether the character we are currently at has the same equivalance class value. */ - int len = weights[idx]; + int len = weights[idx & 0xffffff]; int32_t idx2; const UCHAR *np = (const UCHAR *) n; idx2 = findidx (&np); - if (idx2 != 0 && len == weights[idx2]) + if (idx2 != 0 + && (idx >> 24) == (idx2 >> 24) + && len == weights[idx2 & 0xffffff]) { int cnt = 0; + idx &= 0xffffff; + idx2 &= 0xffffff; + while (cnt < len && (weights[idx + 1 + cnt] == weights[idx2 + 1 + cnt])) diff --git a/lib/regcomp.c b/lib/regcomp.c index 7e300a0..d7ee6f3 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -3436,7 +3436,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) /* Build single byte matcing table for this equivalence class. */ char_buf[1] = (unsigned char) '\0'; - len = weights[idx1]; + len = weights[idx1 & 0xffffff]; for (ch = 0; ch < SBC_MAX; ++ch) { char_buf[0] = ch; @@ -3448,11 +3448,15 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) if (idx2 == 0) /* This isn't a valid character. */ continue; - if (len == weights[idx2]) + /* Compare only if the length matches and the collation rule + index is the same. 
*/ + if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) { int cnt = 0; + while (cnt <= len && - weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) + weights[(idx1 & 0xffffff) + 1 + cnt] + == weights[(idx2 & 0xffffff) + 1 + cnt]) ++cnt; if (cnt > len) diff --git a/lib/regexec.c b/lib/regexec.c index 0d48a95..05979b1 100644 --- a/lib/regexec.c +++ b/lib/regexec.c @@ -3949,15 +3949,20 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - idx = findidx (&cp); + int32_t idx = findidx (&cp); if (idx > 0) for (i = 0; i < cset->nequiv_classes; ++i) { int32_t equiv_class_idx = cset->equiv_classes[i]; - size_t weight_len = weights[idx]; - if (weight_len == weights[equiv_class_idx]) + size_t weight_len = weights[idx & 0xffffff]; + if (weight_len == weights[equiv_class_idx & 0xffffff] + && (idx >> 24) == (equiv_class_idx >> 24)) { Idx cnt = 0; + + idx &= 0xffffff; + equiv_class_idx &= 0xffffff; + while (cnt <= weight_len && (weights[equiv_class_idx + 1 + cnt] == weights[idx + 1 + cnt])) -- 1.6.6.384.g14e6a