Hi Stefan,

Stefan Sperling wrote on Thu, Mar 14, 2019 at 04:11:00PM +0100:

> Yes, OK.

Thanks for checking.

Here is the next step outside line.c, completely cleaning up the file
less/filename.c with respect to UTF-8 handling.  The loop needed is very
similar to the one in search.c except that the case data[i] == '\0' can
occur here, resulting in a 0 return value from mbtowc(3), which has to
be counted as a binary character.

Note that the patch does change behaviour even for LC_CTYPE=C.
Currently, arcane ASCII (C0) control characters like 0x01 SOH,
0x02 STX, and so on are not counted as binary, but i think they should.
Currently, with LC_CTYPE=C, only bytes with the high bit set are
counted as binary.  The following patch counts everything as binary
except:
 * printable characters
 * whitespace characters
 * the 0x08 backspace character
 * and (with -R only) the 0x1b escape character

There is no need to handle the ASCII and UTF-8 cases separately.
With LC_CTYPE=C, the new code will advance byte by byte and count
non-ASCII bytes as binary as it should.

The third (n - i) argument to mbtowc(3) prevents buffer overruns;
no harm is done when n - i exceeds MB_CUR_MAX.

The additional, inner loop on escape sequences has no effect and
can be deleted.  The function is_ansi_middle() only returns true
for printable ASCII characters, and for those, the outer loop is
already good enough.  Neither the old nor the new code tries to
check whether the escape sequence ends in a valid or invalid manner.
One could count the ESC as binary for invalid sequences - but it
is not quite clear which sequences should be considered valid and
i don't think this minor detail is worth adding any complexity.
The function only provides crude heuristics in the first place.

This patch cleans up the only caller of the bad function binary_char(),
which can consequently be deleted.

While here, use ssize_t for the return value of read(2) rather than
int, even though it isn't a bug since the nbytes argument is always
the constant 256.  And avoid an idiom that relies on undefined
behaviour by moving p out of the buffer on read(2) failure; the
latter change does the job more portably with one variable less
on the stack.

OK?
  Ingo


Index: filename.c
===================================================================
RCS file: /cvs/src/usr.bin/less/filename.c,v
retrieving revision 1.27
diff -u -p -r1.27 filename.c
--- filename.c  26 Feb 2019 11:01:54 -0000      1.27
+++ filename.c  14 Mar 2019 18:15:54 -0000
@@ -334,25 +334,25 @@ fcomplete(char *s)
 int
 bin_file(int f)
 {
-       int n;
-       int bin_count = 0;
        char data[256];
-       char *p;
-       char *pend;
+       ssize_t i, n;
+       wchar_t ch;
+       int bin_count, len;
 
        if (!seekable(f))
                return (0);
        if (lseek(f, (off_t)0, SEEK_SET) == (off_t)-1)
                return (0);
        n = read(f, data, sizeof (data));
-       pend = &data[n];
-       for (p = data; p < pend; ) {
-               LWCHAR c = step_char(&p, +1, pend);
-               if (ctldisp == OPT_ONPLUS && c == ESC) {
-                       do {
-                               c = step_char(&p, +1, pend);
-                       } while (p < pend && is_ansi_middle(c));
-               } else if (binary_char(c))
+       bin_count = 0;
+       for (i = 0; i < n; i += len) {
+               len = mbtowc(&ch, data + i, n - i);
+               if (len <= 0) {
+                       bin_count++;
+                       len = 1;
+               } else if (iswprint(ch) == 0 && iswspace(ch) == 0 &&
+                   data[i] != '\b' &&
+                   (ctldisp != OPT_ONPLUS || data[i] != ESC))
                        bin_count++;
        }
        /*
Index: charset.c
===================================================================
RCS file: /cvs/src/usr.bin/less/charset.c,v
retrieving revision 1.21
diff -u -p -r1.21 charset.c
--- charset.c   20 Apr 2017 21:23:16 -0000      1.21
+++ charset.c   14 Mar 2019 18:15:54 -0000
@@ -146,18 +146,6 @@ init_charset(void)
 }
 
 /*
- * Is a given character a "binary" character?
- */
-int
-binary_char(LWCHAR c)
-{
-       if (utf_mode)
-               return (is_ubin_char(c));
-       c &= 0377;
-       return (!isprint((unsigned char)c) && !iscntrl((unsigned char)c));
-}
-
-/*
  * Is a given character a "control" character?
  */
 int
Index: funcs.h
===================================================================
RCS file: /cvs/src/usr.bin/less/funcs.h,v
retrieving revision 1.20
diff -u -p -r1.20 funcs.h
--- funcs.h     1 Mar 2019 14:31:34 -0000       1.20
+++ funcs.h     14 Mar 2019 18:15:54 -0000
@@ -56,7 +56,6 @@ void ch_init(int, int);
 void ch_close(void);
 int ch_getflags(void);
 void init_charset(void);
-int binary_char(LWCHAR);
 int control_char(LWCHAR);
 char *prchar(LWCHAR);
 char *prutfchar(LWCHAR);

Reply via email to