bug#20974: Weird newline matching behaviour in --null-data mode

Paul Eggert Sat, 04 Jul 2015 08:51:39 -0700

Norihiro Tanaka wrote:

Not only '.' but also hat
list (e.g. [^a]) should match newline with -z.  So we need clear
RE_HAT_LISTS_NOT_NEWLINE bit.

Thanks for reporting that. I also noticed some related bugs in dfa.c that 'grep' does not exercise (so no grep test cases, alas). Plus, it's long been time that we fix RE_SYNTAX_GREP and RE_SYNTAX_EGREP to match grep's actual behavior. So I installed a Gnulib patch to update RE_SYNTAX_GREP and RE_SYNTAX_EGREP to the fixed behavior (see <http://lists.gnu.org/archive/html/bug-gnulib/2015-07/msg00016.html>) and installed grep patches to sync to gnulib and fix the other problems.

The first attached patch I installed yesterday (and you've commented on it) but I didn't have time to send email about it so am attaching it now. The other five attached patches fix the bugs noted above.

Here's the justification for the first attached patch. The grep documentation says that '.' matches any character, and this includes both NUL and LF. Ordinarily, LF terminates a line and so is never part of match data, but '.' should still match NUL. Conversely with -z, NUL terminates a line and so is never part of match data, but '.' should still match LF.

>From 0e8fda0d880cccd0e1997a905eb9a7910f957245 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 3 Jul 2015 18:23:59 -0700
Subject: [PATCH 1/6] grep: -z '.' now consistently matches newline

Problem reported by Balazs Kezes in: http://bugs.gnu.org/20974
* NEWS: Document this.
* tests/utf8-bracket: New file, to test for this bug.
* src/grep.c (Gcompile, Ecompile): Also specify RE_DOT_NEWLINE.
* tests/Makefile.am (TESTS): Add it.
---
 NEWS               |  4 ++++
 src/grep.c         |  7 ++++---
 tests/Makefile.am  |  1 +
 tests/utf8-bracket | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 43 insertions(+), 3 deletions(-)
 create mode 100755 tests/utf8-bracket

diff --git a/NEWS b/NEWS
index bbbe893..88ed0f4 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,10 @@ GNU grep NEWS                                    -*- outline -*-
   grep no longer reads from uninitialized memory or from beyond the end
   of the heap-allocated input buffer.  This fix addressed CVE-2015-1345.
 
+  With -z, '.' in a pattern now consistently matches newline.
+  Previously, it sometimes matched newline, and sometimes did not.
+  [bug introduced in grep-2.4]
+
   When the JIT stack is exhausted, grep -P now grows the stack rather
   than reporting an internal PCRE error.
 
diff --git a/src/grep.c b/src/grep.c
index 778dbcb..ed54dc2 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1861,15 +1861,16 @@ if any error occurs and -q is not given, the exit status is 2.\n"));
 static void
 Gcompile (char const *pattern, size_t size)
 {
-  GEAcompile (pattern, size, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES);
+  GEAcompile (pattern, size,
+              RE_SYNTAX_GREP | RE_DOT_NEWLINE | RE_NO_EMPTY_RANGES);
 }
 
 static void
 Ecompile (char const *pattern, size_t size)
 {
   GEAcompile (pattern, size,
-              (RE_SYNTAX_POSIX_EGREP | RE_NO_EMPTY_RANGES
-               | RE_UNMATCHED_RIGHT_PAREN_ORD));
+              (RE_SYNTAX_POSIX_EGREP | RE_DOT_NEWLINE
+               | RE_NO_EMPTY_RANGES | RE_UNMATCHED_RIGHT_PAREN_ORD));
 }
 
 static void
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 7bceac7..629d322 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -120,6 +120,7 @@ TESTS =						\
   two-files					\
   unibyte-bracket-expr				\
   unibyte-negated-circumflex			\
+  utf8-bracket					\
   warn-char-classes				\
   word-delim-multibyte				\
   word-multi-file				\
diff --git a/tests/utf8-bracket b/tests/utf8-bracket
new file mode 100755
index 0000000..f5c4a60
--- /dev/null
+++ b/tests/utf8-bracket
@@ -0,0 +1,34 @@
+#!/bin/sh
+# Check bracket expressions in a UTF-8 locale.
+
+# Copyright 2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_en_utf8_locale_
+
+printf '1\n2\n' >in || framework_failure_
+
+fail=0
+
+for locale in C en_US.UTF-8; do
+  for pattern in '1.2' '[12].2' '[1-2].2'; do
+    for suffix in '' '\(\)\1'; do
+      LC_ALL=$locale grep --null-data --quiet "$pattern$suffix" in || fail=1
+    done
+  done
+done
+
+Exit $fail
-- 
2.1.0

>From 4629191813aca41fa65cfba698683e662e41a5a5 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 4 Jul 2015 07:06:50 -0700
Subject: [PATCH 2/6] grep: -z '[^x]' now consistently matches newline

Problem reported by Norihiro Tanaka in: http://bugs.gnu.org/20974#19
* NEWS: Document this.
* src/grep.c (Gcompile, Ecompile): Clear RE_HAT_LISTS_NOT_NEWLINE.
* tests/utf8-bracket: Test this.
---
 NEWS               |  4 ++--
 src/grep.c         |  8 +++++---
 tests/utf8-bracket | 12 +++++++++---
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index 88ed0f4..35c4aad 100644
--- a/NEWS
+++ b/NEWS
@@ -12,8 +12,8 @@ GNU grep NEWS                                    -*- outline -*-
   grep no longer reads from uninitialized memory or from beyond the end
   of the heap-allocated input buffer.  This fix addressed CVE-2015-1345.
 
-  With -z, '.' in a pattern now consistently matches newline.
-  Previously, it sometimes matched newline, and sometimes did not.
+  With -z, '.' and '[^x]' in a pattern now consistently match newline.
+  Previously, they sometimes matched newline, and sometimes did not.
   [bug introduced in grep-2.4]
 
   When the JIT stack is exhausted, grep -P now grows the stack rather
diff --git a/src/grep.c b/src/grep.c
index ed54dc2..9b38cf5 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1862,15 +1862,17 @@ static void
 Gcompile (char const *pattern, size_t size)
 {
   GEAcompile (pattern, size,
-              RE_SYNTAX_GREP | RE_DOT_NEWLINE | RE_NO_EMPTY_RANGES);
+              ((RE_SYNTAX_GREP | RE_DOT_NEWLINE | RE_NO_EMPTY_RANGES)
+               & ~RE_HAT_LISTS_NOT_NEWLINE));
 }
 
 static void
 Ecompile (char const *pattern, size_t size)
 {
   GEAcompile (pattern, size,
-              (RE_SYNTAX_POSIX_EGREP | RE_DOT_NEWLINE
-               | RE_NO_EMPTY_RANGES | RE_UNMATCHED_RIGHT_PAREN_ORD));
+              ((RE_SYNTAX_POSIX_EGREP | RE_DOT_NEWLINE
+                | RE_NO_EMPTY_RANGES | RE_UNMATCHED_RIGHT_PAREN_ORD)
+               & ~RE_HAT_LISTS_NOT_NEWLINE));
 }
 
 static void
diff --git a/tests/utf8-bracket b/tests/utf8-bracket
index f5c4a60..b63afbb 100755
--- a/tests/utf8-bracket
+++ b/tests/utf8-bracket
@@ -24,9 +24,15 @@ printf '1\n2\n' >in || framework_failure_
 fail=0
 
 for locale in C en_US.UTF-8; do
-  for pattern in '1.2' '[12].2' '[1-2].2'; do
-    for suffix in '' '\(\)\1'; do
-      LC_ALL=$locale grep --null-data --quiet "$pattern$suffix" in || fail=1
+  for options in -qz -qzE; do
+    case $options in
+      *E*) parens='()';;
+      *) parens='\(\)';;
+    esac
+    for pattern in '1.2' '[12].2' '[1-2].2' '[1-2][^a][1-2]'; do
+      for suffix in '' "$parens\\1"; do
+        LC_ALL=$locale grep $options "$pattern$suffix" in || fail=1
+      done
     done
   done
 done
-- 
2.1.0

>From 66521f5b70559b58498d47a0afb92b174f12d78f Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 4 Jul 2015 07:16:32 -0700
Subject: [PATCH 3/6] dfa: '.' and '[^x]' now consistently match newline

* src/dfa.c (parse_bracket_exp, lex, add_utf8_anychar)
(match_anychar): RE_DOT_NEWLINE and RE_HAT_LISTS_NOT_NEWLINE
are about LF, not about eolbyte.  This patch does not affect
'grep', but may affect other users of dfa.c.
---
 src/dfa.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 8901f69..c7b659e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1242,7 +1242,7 @@ parse_bracket_exp (void)
       assert (!dfa->multibyte);
       notset (ccl);
       if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
-        clrbit (eolbyte, ccl);
+        clrbit ('\n', ccl);
     }
 
   return CSET + charclass_index (ccl);
@@ -1487,7 +1487,7 @@ lex (void)
           zeroset (ccl);
           notset (ccl);
           if (!(syntax_bits & RE_DOT_NEWLINE))
-            clrbit (eolbyte, ccl);
+            clrbit ('\n', ccl);
           if (syntax_bits & RE_DOT_NOT_NULL)
             clrbit ('\0', ccl);
           laststart = false;
@@ -1759,7 +1759,7 @@ add_utf8_anychar (void)
         if (i == 1)
           {
             if (!(syntax_bits & RE_DOT_NEWLINE))
-              clrbit (eolbyte, c);
+              clrbit ('\n', c);
             if (syntax_bits & RE_DOT_NOT_NULL)
               clrbit ('\0', c);
           }
@@ -2991,7 +2991,7 @@ match_anychar (struct dfa *d, state_num s, position pos,
   int context;
 
   /* Check syntax bits.  */
-  if (wc == (wchar_t) eolbyte)
+  if (wc == (wchar_t) '\n')
     {
       if (!(syntax_bits & RE_DOT_NEWLINE))
         return 0;
-- 
2.1.0

>From 4ced7bc93f152981cebc8a001f7f1f6b031fec78 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 4 Jul 2015 08:28:09 -0700
Subject: [PATCH 4/6] build: update gnulib submodule to latest

---
 gnulib | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnulib b/gnulib
index aecd387..5a5a938 160000
--- a/gnulib
+++ b/gnulib
@@ -1 +1 @@
-Subproject commit aecd38787af5ca0000e184912194e8c83123eb7f
+Subproject commit 5a5a9388e93d00a7bcb97700a7a552bef20343fd
-- 
2.1.0

>From 89ba3a292ab3fc76a9293c49f2a40953948eaac4 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 4 Jul 2015 08:33:54 -0700
Subject: [PATCH 5/6] maint: ignore gendocs_template_min

* doc/.gitignore: Add '/gendocs_template_min'.
---
 doc/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/.gitignore b/doc/.gitignore
index a059eae..ac147d2 100644
--- a/doc/.gitignore
+++ b/doc/.gitignore
@@ -2,6 +2,7 @@
 /fdl.texi
 /fgrep.1
 /gendocs_template
+/gendocs_template_min
 /grep.info*
 /stamp-vti
 /version.texi
-- 
2.1.0

>From c279a25680129b5525a6ee0f7112f5fb0061ce96 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 4 Jul 2015 08:35:58 -0700
Subject: [PATCH 6/6] grep: use recent gnulib syntax bits

* src/grep.c (Gcompile, Ecompile): Use plain RE_SYNTAX_GREP
and RE_SYNTAX_EGREP, now that we assume a recent-enough gnulib.
---
 src/grep.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index 9b38cf5..a735ea5 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1861,18 +1861,13 @@ if any error occurs and -q is not given, the exit status is 2.\n"));
 static void
 Gcompile (char const *pattern, size_t size)
 {
-  GEAcompile (pattern, size,
-              ((RE_SYNTAX_GREP | RE_DOT_NEWLINE | RE_NO_EMPTY_RANGES)
-               & ~RE_HAT_LISTS_NOT_NEWLINE));
+  GEAcompile (pattern, size, RE_SYNTAX_GREP);
 }
 
 static void
 Ecompile (char const *pattern, size_t size)
 {
-  GEAcompile (pattern, size,
-              ((RE_SYNTAX_POSIX_EGREP | RE_DOT_NEWLINE
-                | RE_NO_EMPTY_RANGES | RE_UNMATCHED_RIGHT_PAREN_ORD)
-               & ~RE_HAT_LISTS_NOT_NEWLINE));
+  GEAcompile (pattern, size, RE_SYNTAX_EGREP);
 }
 
 static void
-- 
2.1.0

bug#20974: Weird newline matching behaviour in --null-data mode

Reply via email to