Hello! I've bisected the upstream git repository between v2.22 and v2.23 and found that the issue is caused by commit 40ed879db22d57516a31fefd1c39416974b74ec4, "grep: fix bug with with invalid unibyte sequence".
I've verified that v2.23 with only the above commit reverted avoids the problem (see the attached patch for my exact conflict resolution). I also tested building the Debian grep package with the patch applied; as expected, that also avoids the issue. A debdiff is attached. I hope this can be used as an interim fix to keep pkg-gnome packages from losing Uploaders information until someone with deeper knowledge can dig into what is causing the problem in the upstream change. Regards, Andreas Henriksson
>From 3a3a38e0a8013b4d1058079a519bd0596b98b42b Mon Sep 17 00:00:00 2001 From: Andreas Henriksson <andreas.henriks...@endian.se> Date: Wed, 17 Feb 2016 15:29:07 +0100 Subject: [PATCH] Revert "grep: fix bug with with invalid unibyte sequence" This reverts commit 40ed879db22d57516a31fefd1c39416974b74ec4. Conflicts: src/grep.c tests/unibyte-binary --- src/grep.c | 70 ++++++++++++++++++++++------------------------------ tests/Makefile.am | 1 - tests/pcre-z | 9 +++---- tests/unibyte-binary | 32 ------------------------ 4 files changed, 34 insertions(+), 78 deletions(-) delete mode 100755 tests/unibyte-binary diff --git a/src/grep.c b/src/grep.c index 73c3651..be7d201 100644 --- a/src/grep.c +++ b/src/grep.c @@ -484,6 +484,21 @@ clean_up_stdout (void) close_stdout (); } +/* The high-order bit of a byte. */ +enum { HIBYTE = 0x80 }; + +/* True if every byte with HIBYTE off is a single-byte character. + UTF-8 has this property. */ +static bool easy_encoding; + +static void +init_easy_encoding (void) +{ + easy_encoding = true; + for (int i = 0; i < HIBYTE; i++) + easy_encoding &= mbclen_cache[i] == 1; +} + /* A cast to TYPE of VAL. Use this when TYPE is a pointer type, VAL is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer the alignment and would otherwise complain about the cast. */ @@ -502,46 +517,21 @@ clean_up_stdout (void) /* An unsigned type suitable for fast matching. */ typedef uintmax_t uword; -/* A mask to test for unibyte characters, with the pattern repeated to - fill a uword. For a multibyte character encoding where - all bytes are unibyte characters, this is 0. For UTF-8, this is - 0x808080.... For encodings where unibyte characters have no discerned - pattern, this is all 1s. The unsigned char C is a unibyte - character if C & UNIBYTE_MASK is zero. If the uword W is the - concatenation of bytes, the bytes are all unibyte characters - if W & UNIBYTE_MASK is zero. 
*/ -static uword unibyte_mask; - -static void -initialize_unibyte_mask (void) -{ - /* For each encoding error I that MASK does not already match, - accumulate I's most significant 1 bit by ORing it into MASK. - Although any 1 bit of I could be used, in practice high-order - bits work better. */ - unsigned char mask = 0; - int ms1b = 1; - for (int i = 1; i <= UCHAR_MAX; i++) - if (mbclen_cache[i] != 1 && ! (mask & i)) - { - while (ms1b * 2 <= i) - ms1b *= 2; - mask |= ms1b; - } - - /* Now MASK will detect any encoding-error byte, although it may - cry wolf and it may not be optimal. Build a uword-length mask by - repeating MASK. */ - uword uword_max = -1; - unibyte_mask = uword_max / UCHAR_MAX * mask; -} - /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel that is not easy, and return a pointer to the first non-easy byte. - The easy bytes all have UNIBYTE_MASK off. */ + In easy encodings, the easy bytes all have HIBYTE off. + In other encodings, no byte is easy. */ static char const * _GL_ATTRIBUTE_PURE skip_easy_bytes (char const *buf) { + if (!easy_encoding) + return buf; + + uword uword_max = -1; + + /* 0x8080..., extended to be wide enough for uword. */ + uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE; + /* Search a byte at a time until the pointer is aligned, then a uword at a time until a match is found, then a byte at a time to identify the exact byte. The uword search may go slightly past @@ -549,11 +539,11 @@ skip_easy_bytes (char const *buf) char const *p; uword const *s; for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++) - if (to_uchar (*p) & unibyte_mask) + if (*p & HIBYTE) return p; - for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++) + for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++) continue; - for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++) + for (p = (char const *) s; ! 
(*p & HIBYTE); p++) continue; return p; } @@ -564,7 +554,7 @@ skip_easy_bytes (char const *buf) bool buf_has_encoding_errors (char *buf, size_t size) { - if (! unibyte_mask) + if (MB_CUR_MAX <= 1) return false; mbstate_t mbs = { 0 }; @@ -2608,7 +2598,7 @@ main (int argc, char **argv) usage (EXIT_TROUBLE); build_mbclen_cache (); - initialize_unibyte_mask (); + init_easy_encoding (); /* In a unibyte locale, switch from fgrep to grep if the pattern matches words (where grep is typically faster). diff --git a/tests/Makefile.am b/tests/Makefile.am index a38303c..f349aa3 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -133,7 +133,6 @@ TESTS = \ turkish-I-without-dot \ turkish-eyes \ two-files \ - unibyte-binary \ unibyte-bracket-expr \ unibyte-negated-circumflex \ utf8-bracket \ diff --git a/tests/pcre-z b/tests/pcre-z index 4ce9a93..6bbde94 100755 --- a/tests/pcre-z +++ b/tests/pcre-z @@ -2,11 +2,10 @@ # Test Perl regex with NUL-separated input . "${srcdir=.}/init.sh"; path_prepend_ ../src require_pcre_ -require_en_utf8_locale_ REGEX=a -printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_ +printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.' compare /dev/null err || fail_ 'stderr not empty on grep -z.' @@ -21,8 +20,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1 compare exp out || fail=1 compare /dev/null err || fail=1 -printf '\303\200\0' >in0 # "À" followed by a NUL. -LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1 -cmp in0 out || fail=1 +printf '\200\0' >in0 +LC_ALL=C grep -z . in0 >out || fail=1 +compare in0 out || fail=1 Exit $fail diff --git a/tests/unibyte-binary b/tests/unibyte-binary deleted file mode 100755 index 11325ee..0000000 --- a/tests/unibyte-binary +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/sh -# Test binary files in unibyte locales with encoding errors - -# Copyright 2016 Free Software Foundation, Inc. 
- -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. - -. "${srcdir=.}/init.sh"; path_prepend_ ../src -require_unibyte_locale - -fail=0 - -printf 'a\n\200\nb\n' >in || framework_failure_ -printf 'a\nBinary file in matches\n' >exp || framework_failure_ -grep . in >out || fail=1 - -# In some unibyte locales, \200 is an encoding error; -# in others, it is a valid character. Allow either possibility. -compare exp out || compare in out || fail=1 - -Exit $fail -- 2.1.4
diff -Nru grep-2.23/debian/changelog grep-2.23/debian/changelog --- grep-2.23/debian/changelog 2016-02-16 09:27:35.000000000 +0100 +++ grep-2.23/debian/changelog 2016-02-17 15:40:57.000000000 +0100 @@ -1,3 +1,11 @@ +grep (2.23-1.1) UNRELEASED; urgency=medium + + * Non-maintainer upload. + * Add 0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch + (Closes: #814997) + + -- Andreas Henriksson <andr...@fatal.se> Wed, 17 Feb 2016 15:40:10 +0100 + grep (2.23-1) unstable; urgency=low * New upstream release. diff -Nru grep-2.23/debian/patches/0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch grep-2.23/debian/patches/0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch --- grep-2.23/debian/patches/0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch 1970-01-01 01:00:00.000000000 +0100 +++ grep-2.23/debian/patches/0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch 2016-02-17 15:39:46.000000000 +0100 @@ -0,0 +1,216 @@ +From 3a3a38e0a8013b4d1058079a519bd0596b98b42b Mon Sep 17 00:00:00 2001 +From: Andreas Henriksson <andreas.henriks...@endian.se> +Date: Wed, 17 Feb 2016 15:29:07 +0100 +Subject: [PATCH] Revert "grep: fix bug with with invalid unibyte sequence" + +This reverts commit 40ed879db22d57516a31fefd1c39416974b74ec4. + +Conflicts: + src/grep.c + tests/unibyte-binary +--- + src/grep.c | 70 ++++++++++++++++++++++------------------------------ + tests/Makefile.am | 1 - + tests/pcre-z | 9 +++---- + tests/unibyte-binary | 32 ------------------------ + 4 files changed, 34 insertions(+), 78 deletions(-) + delete mode 100755 tests/unibyte-binary + +diff --git a/src/grep.c b/src/grep.c +index 73c3651..be7d201 100644 +--- a/src/grep.c ++++ b/src/grep.c +@@ -484,6 +484,21 @@ clean_up_stdout (void) + close_stdout (); + } + ++/* The high-order bit of a byte. */ ++enum { HIBYTE = 0x80 }; ++ ++/* True if every byte with HIBYTE off is a single-byte character. ++ UTF-8 has this property. 
*/ ++static bool easy_encoding; ++ ++static void ++init_easy_encoding (void) ++{ ++ easy_encoding = true; ++ for (int i = 0; i < HIBYTE; i++) ++ easy_encoding &= mbclen_cache[i] == 1; ++} ++ + /* A cast to TYPE of VAL. Use this when TYPE is a pointer type, VAL + is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer + the alignment and would otherwise complain about the cast. */ +@@ -502,46 +517,21 @@ clean_up_stdout (void) + /* An unsigned type suitable for fast matching. */ + typedef uintmax_t uword; + +-/* A mask to test for unibyte characters, with the pattern repeated to +- fill a uword. For a multibyte character encoding where +- all bytes are unibyte characters, this is 0. For UTF-8, this is +- 0x808080.... For encodings where unibyte characters have no discerned +- pattern, this is all 1s. The unsigned char C is a unibyte +- character if C & UNIBYTE_MASK is zero. If the uword W is the +- concatenation of bytes, the bytes are all unibyte characters +- if W & UNIBYTE_MASK is zero. */ +-static uword unibyte_mask; +- +-static void +-initialize_unibyte_mask (void) +-{ +- /* For each encoding error I that MASK does not already match, +- accumulate I's most significant 1 bit by ORing it into MASK. +- Although any 1 bit of I could be used, in practice high-order +- bits work better. */ +- unsigned char mask = 0; +- int ms1b = 1; +- for (int i = 1; i <= UCHAR_MAX; i++) +- if (mbclen_cache[i] != 1 && ! (mask & i)) +- { +- while (ms1b * 2 <= i) +- ms1b *= 2; +- mask |= ms1b; +- } +- +- /* Now MASK will detect any encoding-error byte, although it may +- cry wolf and it may not be optimal. Build a uword-length mask by +- repeating MASK. */ +- uword uword_max = -1; +- unibyte_mask = uword_max / UCHAR_MAX * mask; +-} +- + /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel + that is not easy, and return a pointer to the first non-easy byte. +- The easy bytes all have UNIBYTE_MASK off. 
*/ ++ In easy encodings, the easy bytes all have HIBYTE off. ++ In other encodings, no byte is easy. */ + static char const * _GL_ATTRIBUTE_PURE + skip_easy_bytes (char const *buf) + { ++ if (!easy_encoding) ++ return buf; ++ ++ uword uword_max = -1; ++ ++ /* 0x8080..., extended to be wide enough for uword. */ ++ uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE; ++ + /* Search a byte at a time until the pointer is aligned, then a + uword at a time until a match is found, then a byte at a time to + identify the exact byte. The uword search may go slightly past +@@ -549,11 +539,11 @@ skip_easy_bytes (char const *buf) + char const *p; + uword const *s; + for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++) +- if (to_uchar (*p) & unibyte_mask) ++ if (*p & HIBYTE) + return p; +- for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++) ++ for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++) + continue; +- for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++) ++ for (p = (char const *) s; ! (*p & HIBYTE); p++) + continue; + return p; + } +@@ -564,7 +554,7 @@ skip_easy_bytes (char const *buf) + bool + buf_has_encoding_errors (char *buf, size_t size) + { +- if (! unibyte_mask) ++ if (MB_CUR_MAX <= 1) + return false; + + mbstate_t mbs = { 0 }; +@@ -2608,7 +2598,7 @@ main (int argc, char **argv) + usage (EXIT_TROUBLE); + + build_mbclen_cache (); +- initialize_unibyte_mask (); ++ init_easy_encoding (); + + /* In a unibyte locale, switch from fgrep to grep if + the pattern matches words (where grep is typically faster). 
+diff --git a/tests/Makefile.am b/tests/Makefile.am +index a38303c..f349aa3 100644 +--- a/tests/Makefile.am ++++ b/tests/Makefile.am +@@ -133,7 +133,6 @@ TESTS = \ + turkish-I-without-dot \ + turkish-eyes \ + two-files \ +- unibyte-binary \ + unibyte-bracket-expr \ + unibyte-negated-circumflex \ + utf8-bracket \ +diff --git a/tests/pcre-z b/tests/pcre-z +index 4ce9a93..6bbde94 100755 +--- a/tests/pcre-z ++++ b/tests/pcre-z +@@ -2,11 +2,10 @@ + # Test Perl regex with NUL-separated input + . "${srcdir=.}/init.sh"; path_prepend_ ../src + require_pcre_ +-require_en_utf8_locale_ + + REGEX=a + +-printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_ ++printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in + + grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.' + compare /dev/null err || fail_ 'stderr not empty on grep -z.' +@@ -21,8 +20,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1 + compare exp out || fail=1 + compare /dev/null err || fail=1 + +-printf '\303\200\0' >in0 # "À" followed by a NUL. +-LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1 +-cmp in0 out || fail=1 ++printf '\200\0' >in0 ++LC_ALL=C grep -z . in0 >out || fail=1 ++compare in0 out || fail=1 + + Exit $fail +diff --git a/tests/unibyte-binary b/tests/unibyte-binary +deleted file mode 100755 +index 11325ee..0000000 +--- a/tests/unibyte-binary ++++ /dev/null +@@ -1,32 +0,0 @@ +-#!/bin/sh +-# Test binary files in unibyte locales with encoding errors +- +-# Copyright 2016 Free Software Foundation, Inc. +- +-# This program is free software: you can redistribute it and/or modify +-# it under the terms of the GNU General Public License as published by +-# the Free Software Foundation, either version 3 of the License, or +-# (at your option) any later version. +- +-# This program is distributed in the hope that it will be useful, +-# but WITHOUT ANY WARRANTY; without even the implied warranty of +-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +-# GNU General Public License for more details. +- +-# You should have received a copy of the GNU General Public License +-# along with this program. If not, see <http://www.gnu.org/licenses/>. +- +-. "${srcdir=.}/init.sh"; path_prepend_ ../src +-require_unibyte_locale +- +-fail=0 +- +-printf 'a\n\200\nb\n' >in || framework_failure_ +-printf 'a\nBinary file in matches\n' >exp || framework_failure_ +-grep . in >out || fail=1 +- +-# In some unibyte locales, \200 is an encoding error; +-# in others, it is a valid character. Allow either possibility. +-compare exp out || compare in out || fail=1 +- +-Exit $fail +-- +2.1.4 + diff -Nru grep-2.23/debian/patches/series grep-2.23/debian/patches/series --- grep-2.23/debian/patches/series 2016-02-15 17:58:22.000000000 +0100 +++ grep-2.23/debian/patches/series 2016-02-17 15:39:55.000000000 +0100 @@ -2,3 +2,4 @@ 03-397262-dlopen-pcre.patch 05-grep-wrapper-sh.patch 80-587930-man-ere-reference.patch +0001-Revert-grep-fix-bug-with-with-invalid-unibyte-sequen.patch