Hello!

I've bisected the upstream git repo v2.22..v2.23 and found that
the issue is caused by commit 40ed879db22d57516a31fefd1c39416974b74ec4
"grep: fix bug with with invalid unibyte sequence"

I've verified that using v2.23 with only the above commit reverted
avoids the problem (see attached patch for my exact conflict resolution).

I also tested building the Debian grep package with the attached patch applied.
As expected, that also avoids the issue. A debdiff is attached.

I hope this can be used as an interim fix to avoid pkg-gnome packages
losing Uploaders information until someone with deeper knowledge can
dive into what's causing the problem in the upstream change.

Regards,
Andreas Henriksson
From 3a3a38e0a8013b4d1058079a519bd0596b98b42b Mon Sep 17 00:00:00 2001
From: Andreas Henriksson <andreas.henriks...@endian.se>
Date: Wed, 17 Feb 2016 15:29:07 +0100
Subject: [PATCH] Revert "grep: fix bug with with invalid unibyte sequence"

This reverts commit 40ed879db22d57516a31fefd1c39416974b74ec4.

Conflicts:
	src/grep.c
	tests/unibyte-binary
---
 src/grep.c           | 70 ++++++++++++++++++++++------------------------------
 tests/Makefile.am    |  1 -
 tests/pcre-z         |  9 +++----
 tests/unibyte-binary | 32 ------------------------
 4 files changed, 34 insertions(+), 78 deletions(-)
 delete mode 100755 tests/unibyte-binary

diff --git a/src/grep.c b/src/grep.c
index 73c3651..be7d201 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -484,6 +484,21 @@ clean_up_stdout (void)
     close_stdout ();
 }
 
+/* The high-order bit of a byte.  */
+enum { HIBYTE = 0x80 };
+
+/* True if every byte with HIBYTE off is a single-byte character.
+   UTF-8 has this property.  */
+static bool easy_encoding;
+
+static void
+init_easy_encoding (void)
+{
+  easy_encoding = true;
+  for (int i = 0; i < HIBYTE; i++)
+    easy_encoding &= mbclen_cache[i] == 1;
+}
+
 /* A cast to TYPE of VAL.  Use this when TYPE is a pointer type, VAL
    is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer
    the alignment and would otherwise complain about the cast.  */
@@ -502,46 +517,21 @@ clean_up_stdout (void)
 /* An unsigned type suitable for fast matching.  */
 typedef uintmax_t uword;
 
-/* A mask to test for unibyte characters, with the pattern repeated to
-   fill a uword.  For a multibyte character encoding where
-   all bytes are unibyte characters, this is 0.  For UTF-8, this is
-   0x808080....  For encodings where unibyte characters have no discerned
-   pattern, this is all 1s.  The unsigned char C is a unibyte
-   character if C & UNIBYTE_MASK is zero.  If the uword W is the
-   concatenation of bytes, the bytes are all unibyte characters
-   if W & UNIBYTE_MASK is zero.  */
-static uword unibyte_mask;
-
-static void
-initialize_unibyte_mask (void)
-{
-  /* For each encoding error I that MASK does not already match,
-     accumulate I's most significant 1 bit by ORing it into MASK.
-     Although any 1 bit of I could be used, in practice high-order
-     bits work better.  */
-  unsigned char mask = 0;
-  int ms1b = 1;
-  for (int i = 1; i <= UCHAR_MAX; i++)
-    if (mbclen_cache[i] != 1 && ! (mask & i))
-      {
-        while (ms1b * 2 <= i)
-          ms1b *= 2;
-        mask |= ms1b;
-      }
-
-  /* Now MASK will detect any encoding-error byte, although it may
-     cry wolf and it may not be optimal.  Build a uword-length mask by
-     repeating MASK.  */
-  uword uword_max = -1;
-  unibyte_mask = uword_max / UCHAR_MAX * mask;
-}
-
 /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel
    that is not easy, and return a pointer to the first non-easy byte.
-   The easy bytes all have UNIBYTE_MASK off.  */
+   In easy encodings, the easy bytes all have HIBYTE off.
+   In other encodings, no byte is easy.  */
 static char const * _GL_ATTRIBUTE_PURE
 skip_easy_bytes (char const *buf)
 {
+  if (!easy_encoding)
+    return buf;
+
+  uword uword_max = -1;
+
+  /* 0x8080..., extended to be wide enough for uword.  */
+  uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE;
+
   /* Search a byte at a time until the pointer is aligned, then a
      uword at a time until a match is found, then a byte at a time to
      identify the exact byte.  The uword search may go slightly past
@@ -549,11 +539,11 @@ skip_easy_bytes (char const *buf)
   char const *p;
   uword const *s;
   for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++)
-    if (to_uchar (*p) & unibyte_mask)
+    if (*p & HIBYTE)
       return p;
-  for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++)
+  for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++)
     continue;
-  for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++)
+  for (p = (char const *) s; ! (*p & HIBYTE); p++)
     continue;
   return p;
 }
@@ -564,7 +554,7 @@ skip_easy_bytes (char const *buf)
 bool
 buf_has_encoding_errors (char *buf, size_t size)
 {
-  if (! unibyte_mask)
+  if (MB_CUR_MAX <= 1)
     return false;
 
   mbstate_t mbs = { 0 };
@@ -2608,7 +2598,7 @@ main (int argc, char **argv)
     usage (EXIT_TROUBLE);
 
   build_mbclen_cache ();
-  initialize_unibyte_mask ();
+  init_easy_encoding ();
 
   /* In a unibyte locale, switch from fgrep to grep if
      the pattern matches words (where grep is typically faster).
diff --git a/tests/Makefile.am b/tests/Makefile.am
index a38303c..f349aa3 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -133,7 +133,6 @@ TESTS =						\
   turkish-I-without-dot				\
   turkish-eyes					\
   two-files					\
-  unibyte-binary				\
   unibyte-bracket-expr				\
   unibyte-negated-circumflex			\
   utf8-bracket					\
diff --git a/tests/pcre-z b/tests/pcre-z
index 4ce9a93..6bbde94 100755
--- a/tests/pcre-z
+++ b/tests/pcre-z
@@ -2,11 +2,10 @@
 # Test Perl regex with NUL-separated input
 . "${srcdir=.}/init.sh"; path_prepend_ ../src
 require_pcre_
-require_en_utf8_locale_
 
 REGEX=a
 
-printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_
+printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in
 
 grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.'
 compare /dev/null err || fail_ 'stderr not empty on grep -z.'
@@ -21,8 +20,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1
 compare exp out || fail=1
 compare /dev/null err || fail=1
 
-printf '\303\200\0' >in0 # "À" followed by a NUL.
-LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1
-cmp in0 out || fail=1
+printf '\200\0' >in0
+LC_ALL=C grep -z . in0 >out || fail=1
+compare in0 out || fail=1
 
 Exit $fail
diff --git a/tests/unibyte-binary b/tests/unibyte-binary
deleted file mode 100755
index 11325ee..0000000
--- a/tests/unibyte-binary
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/sh
-# Test binary files in unibyte locales with encoding errors
-
-# Copyright 2016 Free Software Foundation, Inc.
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-. "${srcdir=.}/init.sh"; path_prepend_ ../src
-require_unibyte_locale
-
-fail=0
-
-printf 'a\n\200\nb\n' >in || framework_failure_
-printf 'a\nBinary file in matches\n' >exp || framework_failure_
-grep . in >out || fail=1
-
-# In some unibyte locales, \200 is an encoding error;
-# in others, it is a valid character.  Allow either possibility.
-compare exp out || compare in out || fail=1
-
-Exit $fail
-- 
2.1.4

diff -Nru grep-2.23/debian/changelog grep-2.23/debian/changelog
--- grep-2.23/debian/changelog  2016-02-16 09:27:35.000000000 +0100
+++ grep-2.23/debian/changelog  2016-02-17 15:40:57.000000000 +0100
@@ -1,3 +1,11 @@
+grep (2.23-1.1) UNRELEASED; urgency=medium
+
+  * Non-maintainer upload.
+  * Add 0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch
+    (Closes: #814997)
+
+ -- Andreas Henriksson <andr...@fatal.se>  Wed, 17 Feb 2016 15:40:10 +0100
+
 grep (2.23-1) unstable; urgency=low
 
   * New upstream release.
diff -Nru grep-2.23/debian/patches/0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch grep-2.23/debian/patches/0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch
--- grep-2.23/debian/patches/0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch  1970-01-01 01:00:00.000000000 +0100
+++ grep-2.23/debian/patches/0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch  2016-02-17 15:39:46.000000000 +0100
@@ -0,0 +1,216 @@
+From 3a3a38e0a8013b4d1058079a519bd0596b98b42b Mon Sep 17 00:00:00 2001
+From: Andreas Henriksson <andreas.henriks...@endian.se>
+Date: Wed, 17 Feb 2016 15:29:07 +0100
+Subject: [PATCH] Revert "grep: fix bug with with invalid unibyte sequence"
+
+This reverts commit 40ed879db22d57516a31fefd1c39416974b74ec4.
+
+Conflicts:
+       src/grep.c
+       tests/unibyte-binary
+---
+ src/grep.c           | 70 ++++++++++++++++++++++------------------------------
+ tests/Makefile.am    |  1 -
+ tests/pcre-z         |  9 +++----
+ tests/unibyte-binary | 32 ------------------------
+ 4 files changed, 34 insertions(+), 78 deletions(-)
+ delete mode 100755 tests/unibyte-binary
+
+diff --git a/src/grep.c b/src/grep.c
+index 73c3651..be7d201 100644
+--- a/src/grep.c
++++ b/src/grep.c
+@@ -484,6 +484,21 @@ clean_up_stdout (void)
+     close_stdout ();
+ }
+ 
++/* The high-order bit of a byte.  */
++enum { HIBYTE = 0x80 };
++
++/* True if every byte with HIBYTE off is a single-byte character.
++   UTF-8 has this property.  */
++static bool easy_encoding;
++
++static void
++init_easy_encoding (void)
++{
++  easy_encoding = true;
++  for (int i = 0; i < HIBYTE; i++)
++    easy_encoding &= mbclen_cache[i] == 1;
++}
++
+ /* A cast to TYPE of VAL.  Use this when TYPE is a pointer type, VAL
+    is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer
+    the alignment and would otherwise complain about the cast.  */
+@@ -502,46 +517,21 @@ clean_up_stdout (void)
+ /* An unsigned type suitable for fast matching.  */
+ typedef uintmax_t uword;
+ 
+-/* A mask to test for unibyte characters, with the pattern repeated to
+-   fill a uword.  For a multibyte character encoding where
+-   all bytes are unibyte characters, this is 0.  For UTF-8, this is
+-   0x808080....  For encodings where unibyte characters have no discerned
+-   pattern, this is all 1s.  The unsigned char C is a unibyte
+-   character if C & UNIBYTE_MASK is zero.  If the uword W is the
+-   concatenation of bytes, the bytes are all unibyte characters
+-   if W & UNIBYTE_MASK is zero.  */
+-static uword unibyte_mask;
+-
+-static void
+-initialize_unibyte_mask (void)
+-{
+-  /* For each encoding error I that MASK does not already match,
+-     accumulate I's most significant 1 bit by ORing it into MASK.
+-     Although any 1 bit of I could be used, in practice high-order
+-     bits work better.  */
+-  unsigned char mask = 0;
+-  int ms1b = 1;
+-  for (int i = 1; i <= UCHAR_MAX; i++)
+-    if (mbclen_cache[i] != 1 && ! (mask & i))
+-      {
+-        while (ms1b * 2 <= i)
+-          ms1b *= 2;
+-        mask |= ms1b;
+-      }
+-
+-  /* Now MASK will detect any encoding-error byte, although it may
+-     cry wolf and it may not be optimal.  Build a uword-length mask by
+-     repeating MASK.  */
+-  uword uword_max = -1;
+-  unibyte_mask = uword_max / UCHAR_MAX * mask;
+-}
+-
+ /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel
+    that is not easy, and return a pointer to the first non-easy byte.
+-   The easy bytes all have UNIBYTE_MASK off.  */
++   In easy encodings, the easy bytes all have HIBYTE off.
++   In other encodings, no byte is easy.  */
+ static char const * _GL_ATTRIBUTE_PURE
+ skip_easy_bytes (char const *buf)
+ {
++  if (!easy_encoding)
++    return buf;
++
++  uword uword_max = -1;
++
++  /* 0x8080..., extended to be wide enough for uword.  */
++  uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE;
++
+   /* Search a byte at a time until the pointer is aligned, then a
+      uword at a time until a match is found, then a byte at a time to
+      identify the exact byte.  The uword search may go slightly past
+@@ -549,11 +539,11 @@ skip_easy_bytes (char const *buf)
+   char const *p;
+   uword const *s;
+   for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++)
+-    if (to_uchar (*p) & unibyte_mask)
++    if (*p & HIBYTE)
+       return p;
+-  for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++)
++  for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++)
+     continue;
+-  for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++)
++  for (p = (char const *) s; ! (*p & HIBYTE); p++)
+     continue;
+   return p;
+ }
+@@ -564,7 +554,7 @@ skip_easy_bytes (char const *buf)
+ bool
+ buf_has_encoding_errors (char *buf, size_t size)
+ {
+-  if (! unibyte_mask)
++  if (MB_CUR_MAX <= 1)
+     return false;
+ 
+   mbstate_t mbs = { 0 };
+@@ -2608,7 +2598,7 @@ main (int argc, char **argv)
+     usage (EXIT_TROUBLE);
+ 
+   build_mbclen_cache ();
+-  initialize_unibyte_mask ();
++  init_easy_encoding ();
+ 
+   /* In a unibyte locale, switch from fgrep to grep if
+      the pattern matches words (where grep is typically faster).
+diff --git a/tests/Makefile.am b/tests/Makefile.am
+index a38303c..f349aa3 100644
+--- a/tests/Makefile.am
++++ b/tests/Makefile.am
+@@ -133,7 +133,6 @@ TESTS =                                            \
+   turkish-I-without-dot                               \
+   turkish-eyes                                        \
+   two-files                                   \
+-  unibyte-binary                              \
+   unibyte-bracket-expr                                \
+   unibyte-negated-circumflex                  \
+   utf8-bracket                                        \
+diff --git a/tests/pcre-z b/tests/pcre-z
+index 4ce9a93..6bbde94 100755
+--- a/tests/pcre-z
++++ b/tests/pcre-z
+@@ -2,11 +2,10 @@
+ # Test Perl regex with NUL-separated input
+ . "${srcdir=.}/init.sh"; path_prepend_ ../src
+ require_pcre_
+-require_en_utf8_locale_
+ 
+ REGEX=a
+ 
+-printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_
++printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in
+ 
+ grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.'
+ compare /dev/null err || fail_ 'stderr not empty on grep -z.'
+@@ -21,8 +20,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1
+ compare exp out || fail=1
+ compare /dev/null err || fail=1
+ 
+-printf '\303\200\0' >in0 # "À" followed by a NUL.
+-LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1
+-cmp in0 out || fail=1
++printf '\200\0' >in0
++LC_ALL=C grep -z . in0 >out || fail=1
++compare in0 out || fail=1
+ 
+ Exit $fail
+diff --git a/tests/unibyte-binary b/tests/unibyte-binary
+deleted file mode 100755
+index 11325ee..0000000
+--- a/tests/unibyte-binary
++++ /dev/null
+@@ -1,32 +0,0 @@
+-#!/bin/sh
+-# Test binary files in unibyte locales with encoding errors
+-
+-# Copyright 2016 Free Software Foundation, Inc.
+-
+-# This program is free software: you can redistribute it and/or modify
+-# it under the terms of the GNU General Public License as published by
+-# the Free Software Foundation, either version 3 of the License, or
+-# (at your option) any later version.
+-
+-# This program is distributed in the hope that it will be useful,
+-# but WITHOUT ANY WARRANTY; without even the implied warranty of
+-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+-# GNU General Public License for more details.
+-
+-# You should have received a copy of the GNU General Public License
+-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-
+-. "${srcdir=.}/init.sh"; path_prepend_ ../src
+-require_unibyte_locale
+-
+-fail=0
+-
+-printf 'a\n\200\nb\n' >in || framework_failure_
+-printf 'a\nBinary file in matches\n' >exp || framework_failure_
+-grep . in >out || fail=1
+-
+-# In some unibyte locales, \200 is an encoding error;
+-# in others, it is a valid character.  Allow either possibility.
+-compare exp out || compare in out || fail=1
+-
+-Exit $fail
+-- 
+2.1.4
+
diff -Nru grep-2.23/debian/patches/series grep-2.23/debian/patches/series
--- grep-2.23/debian/patches/series     2016-02-15 17:58:22.000000000 +0100
+++ grep-2.23/debian/patches/series     2016-02-17 15:39:55.000000000 +0100
@@ -2,3 +2,4 @@
 03-397262-dlopen-pcre.patch
 05-grep-wrapper-sh.patch
 80-587930-man-ere-reference.patch
+0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch

Reply via email to