bug#20526: grep BUG: text file is detected as binary

Paul Eggert Fri, 01 Jan 2016 21:24:25 -0800

Norihiro Tanaka wrote:

why this check is applied in only multi-byte locale?

Ouch, good point. I missed the possibility of a unibyte encoding where not all bytes are valid unibyte characters. I installed the attached additional patch to fix this, and to test for the bug I recently introduced here.

From d31900e3fc2406ccb8aa972dd173f27f583c95ec Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 1 Jan 2016 21:16:12 -0800
Subject: [PATCH] grep: fix bug with invalid unibyte sequence

This was introduced by the recent binary-data-detection changes.
Problem reported by Norihiro Tanaka in: http://bugs.gnu.org/20526#86
* src/grep.c (HIBYTE, easy_encoding, init_easy_encoding): Remove,
replacing with ...
(uword_max, unibyte_mask, initialize_unibyte_mask): ... this new
constant, static var, and function.  All uses changed.  The
unibyte_mask var generalizes the old local var hibyte_mask, which
worked only for encodings where every byte with 0x80 turned off is
a single-byte character.
(buf_has_encoding_errors): Return false immediately if
unibyte_mask is zero, not whether the current encoding is unibyte.
The old test was incorrect in unibyte locales in which some bytes
were encoding errors.
* tests/pcre-z: Require UTF-8 locale, since the grep -z . test now
needs this.  Use printf \0 rather than tr.  Port the 'grep -z .'
test to platforms where the C locale says '\200' is an encoding
error.  Use cmp rather than compare, as the file is binary and
so non-GNU diff might not work.
* tests/unibyte-binary: New file.
* tests/Makefile.am (TESTS): Add it.
---
 src/grep.c           | 57 +++++++++++++++++++++++++---------------------------
 tests/Makefile.am    |  1 +
 tests/pcre-z         |  9 +++++----
 tests/unibyte-binary | 28 ++++++++++++++++++++++++++
 4 files changed, 61 insertions(+), 34 deletions(-)
 create mode 100755 tests/unibyte-binary

diff --git a/src/grep.c b/src/grep.c
index 1207a76..a5f1fa2 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -484,21 +484,6 @@ clean_up_stdout (void)
     close_stdout ();
 }
 
-/* The high-order bit of a byte.  */
-enum { HIBYTE = 0x80 };
-
-/* True if every byte with HIBYTE off is a single-byte character.
-   UTF-8 has this property.  */
-static bool easy_encoding;
-
-static void
-init_easy_encoding (void)
-{
-  easy_encoding = true;
-  for (int i = 0; i < HIBYTE; i++)
-    easy_encoding &= mbclen_cache[i] == 1;
-}
-
 /* A cast to TYPE of VAL.  Use this when TYPE is a pointer type, VAL
    is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer
    the alignment and would otherwise complain about the cast.  */
@@ -517,21 +502,33 @@ init_easy_encoding (void)
 /* An unsigned type suitable for fast matching.  */
 typedef uintmax_t uword;
 
+/* All bytes that are not unibyte characters, ANDed together, and then
+   with the pattern repeated to fill a uword.  For an encoding where
+   all bytes are unibyte characters, this is 0.  For UTF-8, this is
+   0x808080....  For encodings where unibyte characters have no useful
+   pattern, this is all 1s.  The unsigned char C is a unibyte
+   character if C & UNIBYTE_MASK is zero.  If the uword W is the
+   concatenation of bytes, the bytes are all unibyte characters
+   if W & UNIBYTE_MASK is zero.  */
+static uword unibyte_mask;
+
+static void
+initialize_unibyte_mask (void)
+{
+  unsigned char mask = UCHAR_MAX;
+  for (int i = 1; i <= UCHAR_MAX; i++)
+    if (mbclen_cache[i] != 1)
+      mask &= i;
+  uword uword_max = -1;
+  unibyte_mask = uword_max / UCHAR_MAX * mask;
+}
+
 /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel
    that is not easy, and return a pointer to the first non-easy byte.
-   In easy encodings, the easy bytes all have HIBYTE off.
-   In other encodings, no byte is easy.  */
+   The easy bytes all have UNIBYTE_MASK off.  */
 static char const * _GL_ATTRIBUTE_PURE
 skip_easy_bytes (char const *buf)
 {
-  if (!easy_encoding)
-    return buf;
-
-  uword uword_max = -1;
-
-  /* 0x8080..., extended to be wide enough for uword.  */
-  uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE;
-
   /* Search a byte at a time until the pointer is aligned, then a
      uword at a time until a match is found, then a byte at a time to
      identify the exact byte.  The uword search may go slightly past
@@ -539,11 +536,11 @@ skip_easy_bytes (char const *buf)
   char const *p;
   uword const *s;
   for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++)
-    if (*p & HIBYTE)
+    if (to_uchar (*p) & unibyte_mask)
       return p;
-  for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++)
+  for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++)
     continue;
-  for (p = (char const *) s; ! (*p & HIBYTE); p++)
+  for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++)
     continue;
   return p;
 }
@@ -554,7 +551,7 @@ skip_easy_bytes (char const *buf)
 static bool
 buf_has_encoding_errors (char *buf, size_t size)
 {
-  if (MB_CUR_MAX <= 1)
+  if (! unibyte_mask)
     return false;
 
   mbstate_t mbs = { 0 };
@@ -2592,7 +2589,7 @@ main (int argc, char **argv)
     usage (EXIT_TROUBLE);
 
   build_mbclen_cache ();
-  init_easy_encoding ();
+  initialize_unibyte_mask ();
 
   /* In a unibyte locale, switch from fgrep to grep if
      the pattern matches words (where grep is typically faster).
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f349aa3..a38303c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -133,6 +133,7 @@ TESTS =                                             \
   turkish-I-without-dot                                \
   turkish-eyes                                 \
   two-files                                    \
+  unibyte-binary                               \
   unibyte-bracket-expr                         \
   unibyte-negated-circumflex                   \
   utf8-bracket                                 \
diff --git a/tests/pcre-z b/tests/pcre-z
index 6bbde94..4ce9a93 100755
--- a/tests/pcre-z
+++ b/tests/pcre-z
@@ -2,10 +2,11 @@
 # Test Perl regex with NUL-separated input
 . "${srcdir=.}/init.sh"; path_prepend_ ../src
 require_pcre_
+require_en_utf8_locale_
 
 REGEX=a
 
-printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in
+printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_
 
 grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.'
 compare /dev/null err || fail_ 'stderr not empty on grep -z.'
@@ -20,8 +21,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1
 compare exp out || fail=1
 compare /dev/null err || fail=1
 
-printf '\200\0' >in0
-LC_ALL=C grep -z . in0 >out || fail=1
-compare in0 out || fail=1
+printf '\303\200\0' >in0 # "À" followed by a NUL.
+LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1
+cmp in0 out || fail=1
 
 Exit $fail
diff --git a/tests/unibyte-binary b/tests/unibyte-binary
new file mode 100755
index 0000000..78735b8
--- /dev/null
+++ b/tests/unibyte-binary
@@ -0,0 +1,28 @@
+#!/bin/sh
+# Test binary files in unibyte locales with encoding errors
+
+# Copyright 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_unibyte_locale
+
+fail=0
+
+printf 'a\n\200\nb\n' >in || framework_failure_
+printf 'a\nBinary file in matches\n' >exp || framework_failure_
+grep . in >out || fail=1
+compare exp out || fail=1
+Exit $fail
-- 
2.5.0

bug#20526: grep BUG: text file is detected as binary

Reply via email to