UTF-8 support for wc(1)

Ingo Schwarze Sun, 29 Nov 2015 08:52:35 -0800

Hi,

our wc(1) utility currently violates POSIX in two ways:


 1. The -m option counts bytes instead of characters.
    The patch given below fixes that.

 2. Word counting with -w only treats ASCII whitespace as word
    boundaries and regards two words joined by non-ASCII whitespace
    as one single word.

The second issue is not related to UTF-8, but a matter of full
Unicode support.  It would not be hard to fix that by using
mbtowc(3) and iswblank(3) instead of mblen(3).  However, i don't
think we want to pollute our base system tools with functions
requiring full Unicode support, not even to the extent available
in our own C library.  So i consider iswblank(3) taboo for now.

A few notes about the patch:

 * As usual, reduce the ridiculous setlocale(LC_ALL, "")
   to what is actually needed, setlocale(LC_CTYPE, "").

 * As usual, -m only differs from -c if LC_CTYPE is set
   to a multibyte encoding.

 * In the case  /* Do it the hard way... */,
   we need to switch from read(2) to getline(3)
   because read(2) might chop multibyte characters to pieces.
   That doesn't affect memory consumption of "wc -l" or "wc -c",
   not even for huge binary files without newline characters.
   It does increase memory consumption for files with very long
   lines when -w or -m is requested - but that's not a problem
   because both only make sense with real text, and real text
   does not have lines of a length that getline(3) is unable
   to handle.

OK?
  Ingo


Index: wc.1
===================================================================
RCS file: /cvs/src/usr.bin/wc/wc.1,v
retrieving revision 1.25
diff -u -p -r1.25 wc.1
--- wc.1        21 Apr 2015 10:46:48 -0000      1.25
+++ wc.1        29 Nov 2015 16:34:28 -0000
@@ -72,9 +72,7 @@ using powers of 2 for sizes (K=1024, M=1
 The number of lines in each input file
 is written to the standard output.
 .It Fl m
-Intended to count characters instead of bytes;
-currently an alias for
-.Fl c .
+Count characters instead of bytes.
 .It Fl w
 The number of words in each input file
 is written to the standard output.
@@ -111,7 +109,8 @@ The
 .Nm
 utility is compliant with the
 .St -p1003.1-2008
-specification, except that it ignores the locale.
+specification, except that it recognizes word boundaries only at ASCII
+whitespace.
 .Pp
 The flag
 .Op Fl h
@@ -121,7 +120,16 @@ A
 .Nm
 utility appeared in
 .At v1 .
-.Sh BUGS
+.Sh CAVEATS
 The
 .Fl m
-option counts bytes instead of characters.
+option depends on the character set
+.Xr locale 1 .
+If
+.Ev LC_CTYPE
+is set to
+.Qq C
+or
+.Qq POSIX ,
+it has the same effect as
+.Fl c .
Index: wc.c
===================================================================
RCS file: /cvs/src/usr.bin/wc/wc.c,v
retrieving revision 1.19
diff -u -p -r1.19 wc.c
--- wc.c        9 Oct 2015 01:37:09 -0000       1.19
+++ wc.c        29 Nov 2015 16:34:28 -0000
@@ -42,7 +42,7 @@
 #include <util.h>
 
 int64_t        tlinect, twordct, tcharct;
-int    doline, doword, dochar, humanchar;
+int    doline, doword, dochar, humanchar, multibyte;
 int    rval;
 extern char *__progname;
 
@@ -55,7 +55,7 @@ main(int argc, char *argv[])
 {
        int ch;
 
-       setlocale(LC_ALL, "");
+       setlocale(LC_CTYPE, "");
 
        if (pledge("stdio rpath", NULL) == -1)
                err(1, "pledge");
@@ -68,8 +68,11 @@ main(int argc, char *argv[])
                case 'w':
                        doword = 1;
                        break;
-               case 'c':
                case 'm':
+                       if (MB_CUR_MAX > 1)
+                               multibyte = 1;
+                       /* FALLTHROUGH */
+               case 'c':
                        dochar = 1;
                        break;
                case 'h':
@@ -112,15 +115,19 @@ main(int argc, char *argv[])
 void
 cnt(char *file)
 {
+       static char *buf;
+       static ssize_t bufsz;
+
+       FILE *stream;
        u_char *C;
        short gotsp;
-       int len;
+       ssize_t len;
        int64_t linect, wordct, charct;
        struct stat sbuf;
        int fd;
-       u_char buf[MAXBSIZE];
 
        linect = wordct = charct = 0;
+       stream = NULL;
        if (file) {
                if ((fd = open(file, O_RDONLY, 0)) < 0) {
                        warn("%s", file);
@@ -131,7 +138,10 @@ cnt(char *file)
                fd = STDIN_FILENO;
        }
 
-       if (!doword) {
+       if (!doword && !multibyte) {
+               if (bufsz < MAXBSIZE &&
+                   (buf = realloc(buf, MAXBSIZE)) == NULL)
+                       err(1, NULL);
                /*
                 * Line counting is split out because it's a lot
                 * faster to get lines than to get words, since
@@ -178,16 +188,25 @@ cnt(char *file)
                        }
                }
        } else {
+               if (file == NULL)
+                       stream = stdin;
+               else if ((stream = fdopen(fd, "r")) == NULL) {
+                       warn("%s", file);
+                       close(fd);
+                       rval = 1;
+                       return;
+               }
+
                /* Do it the hard way... */
                gotsp = 1;
-               while ((len = read(fd, buf, MAXBSIZE)) > 0) {
-                       /*
-                        * This loses in the presence of multi-byte characters.
-                        * To do it right would require a function to return a
-                        * character while knowing how many bytes it consumed.
-                        */
-                       charct += len;
-                       for (C = buf; len--; ++C) {
+               while ((len = getline(&buf, &bufsz, stream)) > 0) {
+                       for (C = buf; *C != '\0'; ++C) {
+                               ++charct;
+                               /*
+                                * XXX For now, we don't want full
+                                * Unicode support.  Only treat ASCII
+                                * whitespace as whitespace.
+                                */
                                if (isspace(*C)) {
                                        gotsp = 1;
                                        if (*C == '\n')
@@ -205,10 +224,13 @@ cnt(char *file)
                                                gotsp = 0;
                                                ++wordct;
                                        }
+                                       if (multibyte &&
+                                           (len = mblen(C, MB_CUR_MAX)) > 1)
+                                               C += len - 1;
                                }
                        }
                }
-               if (len == -1) {
+               if (ferror(stream)) {
                        warn("%s", file);
                        rval = 1;
                }
@@ -224,7 +246,7 @@ cnt(char *file)
        twordct += wordct;
        tcharct += charct;
 
-       if (close(fd) != 0) {
+       if ((stream == NULL ? close(fd) : fclose(stream)) != 0) {
                warn("%s", file);
                rval = 1;
        }

UTF-8 support for wc(1)

Reply via email to