bug#16895: [PATCH] grep: fix multiple bugs with bracket expressions

Paul Eggert Thu, 27 Feb 2014 09:35:28 -0800

Tags: patch done

I'm afraid there are several problems in the dfa code. I still don'thave a handle on all of them, but here's my first patch to deal with thefirst major one I found. Patterns like [a-[.z.]], which caused 'grep'to dump core until recently, still aren't being handled correctly, andthere are several closely related bugs here. I've taken the liberty ofpushing the attached patch.

>From f11f0c9351fdd2bd65efdb469754096d1a237d61 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Thu, 27 Feb 2014 09:26:23 -0800
Subject: [PATCH] grep: fix multiple bugs with bracket expressions


* NEWS: Document this.
* src/dfa.c (using_simple_locale): New function.
(parse_bracket_exp): Handle bracket expressions like [a-[.z.]]
correctly.  Don't assume that dfaexec handles expressions like
[^a-z] correctly, as they can match multiple characters in some
locales.
* tests/posix-bracket: New file.
* tests/Makefile.am (TESTS): Add it.
---
 NEWS                |   4 ++
 src/dfa.c           | 129 +++++++++++++++++++++++++++++-----------------------
 tests/Makefile.am   |   1 +
 tests/posix-bracket |  33 ++++++++++++++
 4 files changed, 110 insertions(+), 57 deletions(-)
 create mode 100755 tests/posix-bracket

diff --git a/NEWS b/NEWS
index 657f3d1..6cfcaba 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,10 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Bug fixes
 
+  grep no longer mishandles patterns like [a-[.z.]], and no longer
+  mishandles patterns like [^a] in locales that have multicharacter
+  collating sequences so that [^a] can match a string of two characters.
+
   grep -P now works with -w and -x and backreferences. Before,
   echo aa|grep -Pw '(.)\1' would fail to match, yet
   echo aa|grep -Pw '(.)\2' would match.
diff --git a/src/dfa.c b/src/dfa.c
index 8906ed3..65ab5d6 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -182,7 +182,8 @@ enum
   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
                                    the empty string.  */
 
-  BACKREF,                      /* BACKREF is generated by \<digit>; it
+  BACKREF,                      /* BACKREF is generated by \<digit>
+                                   or by any other construct that
                                    is not completely handled.  If the scanner
                                    detects a transition on backref, it returns
                                    a kind of "semi-success" indicating that
@@ -769,6 +770,45 @@ using_utf8 (void)
   return utf8;
 }
 
+/* Return true if the current locale is known to be a unibyte locale
+   without multicharacter collating sequences and where range
+   comparisons simply use the native encoding.  These locales can be
+   processed more efficiently.  */
+
+static bool
+using_simple_locale (void)
+{
+  /* True if the native character set is known to be compatible with
+     the C locale.  The following test isn't perfect, but it's good
+     enough in practice, as only ASCII and EBCDIC are in common use
+     and this test correctly accepts ASCII and rejects EBCDIC.  */
+  enum { native_c_charset =
+    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
+     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
+     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
+     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
+     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
+     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
+     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
+     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
+     && '}' == 125 && '~' == 126)
+  };
+
+  if (! native_c_charset || MB_CUR_MAX > 1)
+    return false;
+  else
+    {
+      static int unibyte_c = -1;
+      if (unibyte_c < 0)
+        {
+          char *locale = setlocale (LC_ALL, 0);
+          unibyte_c = (locale && (STREQ (locale, "C")
+                                  || STREQ (locale, "POSIX")));
+        }
+      return unibyte_c;
+    }
+}
+
 /* Lexical analyzer.  All the dross that deals with the obnoxious
    GNU Regex syntax bits is located here.  The poor, suffering
    reader is referred to the GNU Regex documentation for the
@@ -917,6 +957,10 @@ parse_bracket_exp (void)
   int c, c1, c2;
   charclass ccl;
 
+  /* True if this is a bracket expression that dfaexec is known to
+     process correctly.  */
+  bool known_bracket_exp = true;
+
   /* Used to warn about [:space:].
      Bit 0 = first character is a colon.
      Bit 1 = last character is a colon.
@@ -958,6 +1002,7 @@ parse_bracket_exp (void)
     {
       FETCH_WC (c, wc, _("unbalanced ["));
       invert = 1;
+      known_bracket_exp = using_simple_locale ();
     }
   else
     invert = 0;
@@ -972,16 +1017,14 @@ parse_bracket_exp (void)
          we just treat it as a bunch of ordinary characters.  We can do
          this because we assume regex has checked for syntax errors before
          dfa is ever called.  */
-      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
+      if (c == '[')
         {
 #define MAX_BRACKET_STRING_LEN 32
           char str[MAX_BRACKET_STRING_LEN + 1];
           FETCH_WC (c1, wc1, _("unbalanced ["));
 
-          /* If pattern contains '[[:', '[[.', or '[[='.  */
-          if (c1 == ':'
-              /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1.  */
-              || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '=')))
+          if ((c1 == ':' && syntax_bits & RE_CHAR_CLASSES)
+              || c1 == '.' || c1 == '=')
             {
               size_t len = 0;
               for (;;)
@@ -1000,7 +1043,10 @@ parse_bracket_exp (void)
               /* Fetch bracket.  */
               FETCH_WC (c, wc, _("unbalanced ["));
               if (c1 == ':')
-                /* build character class.  */
+                /* Build character class.  POSIX allows character
+                   classes to match multicharacter collating elements,
+                   but the regex code does not support that, so do not
+                   worry about that possibility.  */
                 {
                   char const *class
                     = (case_fold && (STREQ (str, "upper")
@@ -1024,28 +1070,9 @@ parse_bracket_exp (void)
                     if (pred->func (c2))
                       setbit_case_fold_c (c2, ccl);
                 }
+              else
+                known_bracket_exp = false;
 
-              else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
-                {
-                  char *elem = xmemdup (str, len + 1);
-
-                  if (c1 == '=')
-                    /* build equivalence class.  */
-                    {
-                      REALLOC_IF_NECESSARY (work_mbc->equivs,
-                                            equivs_al, work_mbc->nequivs + 1);
-                      work_mbc->equivs[work_mbc->nequivs++] = elem;
-                    }
-
-                  if (c1 == '.')
-                    /* build collating element.  */
-                    {
-                      REALLOC_IF_NECESSARY (work_mbc->coll_elems,
-                                            coll_elems_al,
-                                            work_mbc->ncoll_elems + 1);
-                      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
-                    }
-                }
               colon_warning_state |= 8;
 
               /* Fetch new lookahead character.  */
@@ -1067,6 +1094,16 @@ parse_bracket_exp (void)
         /* build range characters.  */
         {
           FETCH_WC (c2, wc2, _("unbalanced ["));
+
+          /* A bracket expression like [a-[.aa.]] matches an unknown set.
+             Treat it like [-a[.aa.]] while parsing it, and
+             remember that the set is unknown.  */
+          if (c2 == '[' && *lexptr == '.')
+            {
+              known_bracket_exp = false;
+              c2 = ']';
+            }
+
           if (c2 == ']')
             {
               /* In the case [x-], the - is an ordinary hyphen,
@@ -1104,36 +1141,11 @@ parse_bracket_exp (void)
                   work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
                 }
             }
+          else if (using_simple_locale ())
+            for (; c <= c2; c++)
+              setbit_case_fold_c (c, ccl);
           else
-            {
-              /* Defer to the system regex library about the meaning
-                 of range expressions.  */
-              struct re_pattern_buffer re = { 0 };
-              char const *compile_msg;
-#if 199901 <= __STDC_VERSION__
-              char pattern[] = { '[', '\\', c, '-', '\\', c2, ']' };
-#else
-              char pattern[] = { '[', '\\', 0, '-', '\\', 0, ']' };
-              pattern[2] = c;
-              pattern[5] = c2;
-#endif
-              re_set_syntax (syntax_bits | RE_BACKSLASH_ESCAPE_IN_LISTS);
-              compile_msg = re_compile_pattern (pattern, sizeof pattern, &re);
-              if (compile_msg)
-                dfaerror (compile_msg);
-              for (c = 0; c < NOTCHAR; c++)
-                {
-                  char subject = c;
-                  switch (re_match (&re, &subject, 1, 0, NULL))
-                    {
-                    case 1: setbit (c, ccl); break;
-                    case -1: break;
-                    default: xalloc_die ();
-                    }
-                }
-              regfree (&re);
-              re_set_syntax (syntax_bits);
-            }
+            known_bracket_exp = false;
 
           colon_warning_state |= 8;
           FETCH_WC (c1, wc1, _("unbalanced ["));
@@ -1171,6 +1183,9 @@ parse_bracket_exp (void)
   if (colon_warning_state == 7)
     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
 
+  if (! known_bracket_exp)
+    return BACKREF;
+
   if (MB_CUR_MAX > 1)
     {
       static charclass zeroclass;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 742a580..972ffc5 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -86,6 +86,7 @@ TESTS =						\
   pcre-w					\
   pcre-wx-backref				\
   pcre-z					\
+  posix-bracket					\
   prefix-of-multibyte				\
   r-dot						\
   repetition-overflow				\
diff --git a/tests/posix-bracket b/tests/posix-bracket
new file mode 100755
index 0000000..d9d1d84
--- /dev/null
+++ b/tests/posix-bracket
@@ -0,0 +1,33 @@
+#!/bin/sh
+# Check various bracket expressions in the POSIX locale.
+
+# Copyright 2014 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+LC_ALL=C
+export LC_ALL
+
+fail=0
+
+echo a >in || framework_failure_
+for bracketed in '[.a.]' '[.a.]-a' 'a-[.a.]' '[.a.]-[.a.]' \
+    '[=a=]' '[:alpha:]'; do
+  grep "[$bracketed]" in >out || fail=1
+  compare in out || fail=1
+  grep "[^$bracketed]" in >out && fail=1
+  compare /dev/null out || fail=1
+done
+Exit $fail
-- 
1.8.5.3

bug#16895: [PATCH] grep: fix multiple bugs with bracket expressions

Reply via email to