Re: UTF-8 support for wc(1)

Ingo Schwarze Fri, 04 Dec 2015 05:11:56 -0800

Hi Todd,

Todd C. Miller wrote on Thu, Dec 03, 2015 at 11:40:55AM -0700:
> On Sun, 29 Nov 2015 17:45:55 +0100, Ingo Schwarze wrote:


>> our wc(1) utility currently violates POSIX in two ways:
>> 
>>  1. The -m option counts bytes instead of characters.
>>     The patch given below fixes that.
>> 
>>  2. Word counting with -w only treats ASCII whitespace as word
>>     boundaries and regards two words joined by non-ASCII whitespace
>>     as one single word.
>> 
>> The second issue is not related to UTF-8, but a matter of full
>> Unicode support.  It would not be hard to fix that by using
>> mbtowc(3) and iswblank(3) instead of mblen(3).  However, i don't
>> think we want to pollute our base system tools with functions
>> requiring full Unicode support, not even to the extent available
>> in our own C library.  So i consider iswblank(3) taboo for now.

> I'm a little surprised by this.  It doesn't seem like it would be
> any more complicated to use mbtowc(3) and iswblank(3) for the
> multibyte case.

Reconsidering, your argument makes sense to me.  Even if we implement
a simplified lookup table in the future, it doesn't complicate matters.
We already include data for iswprint(3) and wcwidth(3); iswspace(3)
is not more expensive and probably about as often needed.

So let's include iswblank(3) and iswspace(3) in the list of
functions that we are willing to use.  Of course, that still doesn't
mean that we can do full Unicode support (think of collations etc.).

So, here is a patch for wc(1) getting both character and word
counting right.  I also improved the manual in various respects.

OK?
  Ingo


Index: wc.1
===================================================================
RCS file: /cvs/src/usr.bin/wc/wc.1,v
retrieving revision 1.25
diff -u -p -r1.25 wc.1
--- wc.1        21 Apr 2015 10:46:48 -0000      1.25
+++ wc.1        4 Dec 2015 12:54:26 -0000
@@ -72,9 +72,10 @@ using powers of 2 for sizes (K=1024, M=1
 The number of lines in each input file
 is written to the standard output.
 .It Fl m
-Intended to count characters instead of bytes;
-currently an alias for
-.Fl c .
+Count characters instead of bytes, and use
+.Xr iswspace 3
+instead of
+.Xr isspace 3 .
 .It Fl w
 The number of words in each input file
 is written to the standard output.
@@ -102,6 +103,20 @@ lines       words  bytes   file_name
 The counts for lines, words, and bytes
 .Pq or characters
 are integers separated by spaces.
+.Sh ENVIRONMENT
+.Bl -tag -width LC_CTYPE
+.It Ev LC_CTYPE
+The character set
+.Xr locale 1 ,
+defining which byte sequences form characters.
+If unset or set to
+.Qq C ,
+.Qq POSIX ,
+or an unsupported value,
+.Fl m
+has the same effect as
+.Fl c .
+.El
 .Sh EXIT STATUS
 .Ex -std wc
 .Sh SEE ALSO
@@ -111,7 +126,7 @@ The
 .Nm
 utility is compliant with the
 .St -p1003.1-2008
-specification, except that it ignores the locale.
+specification.
 .Pp
 The flag
 .Op Fl h
@@ -121,7 +136,3 @@ A
 .Nm
 utility appeared in
 .At v1 .
-.Sh BUGS
-The
-.Fl m
-option counts bytes instead of characters.
Index: wc.c
===================================================================
RCS file: /cvs/src/usr.bin/wc/wc.c,v
retrieving revision 1.19
diff -u -p -r1.19 wc.c
--- wc.c        9 Oct 2015 01:37:09 -0000       1.19
+++ wc.c        4 Dec 2015 12:54:26 -0000
@@ -40,9 +40,11 @@
 #include <err.h>
 #include <unistd.h>
 #include <util.h>
+#include <wchar.h>
+#include <wctype.h>
 
 int64_t        tlinect, twordct, tcharct;
-int    doline, doword, dochar, humanchar;
+int    doline, doword, dochar, humanchar, multibyte;
 int    rval;
 extern char *__progname;
 
@@ -55,7 +57,7 @@ main(int argc, char *argv[])
 {
        int ch;
 
-       setlocale(LC_ALL, "");
+       setlocale(LC_CTYPE, "");
 
        if (pledge("stdio rpath", NULL) == -1)
                err(1, "pledge");
@@ -68,8 +70,11 @@ main(int argc, char *argv[])
                case 'w':
                        doword = 1;
                        break;
-               case 'c':
                case 'm':
+                       if (MB_CUR_MAX > 1)
+                               multibyte = 1;
+                       /* FALLTHROUGH */
+               case 'c':
                        dochar = 1;
                        break;
                case 'h':
@@ -112,15 +117,20 @@ main(int argc, char *argv[])
 void
 cnt(char *file)
 {
-       u_char *C;
+       static char *buf;
+       static ssize_t bufsz;
+
+       FILE *stream;
+       char *C;
+       wchar_t wc;
        short gotsp;
-       int len;
+       ssize_t len;
        int64_t linect, wordct, charct;
        struct stat sbuf;
        int fd;
-       u_char buf[MAXBSIZE];
 
        linect = wordct = charct = 0;
+       stream = NULL;
        if (file) {
                if ((fd = open(file, O_RDONLY, 0)) < 0) {
                        warn("%s", file);
@@ -131,7 +141,10 @@ cnt(char *file)
                fd = STDIN_FILENO;
        }
 
-       if (!doword) {
+       if (!doword && !multibyte) {
+               if (bufsz < MAXBSIZE &&
+                   (buf = realloc(buf, MAXBSIZE)) == NULL)
+                       err(1, NULL);
                /*
                 * Line counting is split out because it's a lot
                 * faster to get lines than to get words, since
@@ -178,37 +191,57 @@ cnt(char *file)
                        }
                }
        } else {
-               /* Do it the hard way... */
+               if (file == NULL)
+                       stream = stdin;
+               else if ((stream = fdopen(fd, "r")) == NULL) {
+                       warn("%s", file);
+                       close(fd);
+                       rval = 1;
+                       return;
+               }
+
+               /*
+                * Do it the hard way.
+                * According to POSIX, a word is a "maximal string of
+                * characters delimited by whitespace."  Nothing is said
+                * about a character being printing or non-printing.
+                */
                gotsp = 1;
-               while ((len = read(fd, buf, MAXBSIZE)) > 0) {
-                       /*
-                        * This loses in the presence of multi-byte characters.
-                        * To do it right would require a function to return a
-                        * character while knowing how many bytes it consumed.
-                        */
-                       charct += len;
-                       for (C = buf; len--; ++C) {
-                               if (isspace(*C)) {
-                                       gotsp = 1;
-                                       if (*C == '\n')
-                                               ++linect;
-                               } else {
-                                       /*
-                                        * This line implements the POSIX
-                                        * spec, i.e. a word is a "maximal
-                                        * string of characters delimited by
-                                        * whitespace."  Notice nothing was
-                                        * said about a character being
-                                        * printing or non-printing.
-                                        */
-                                       if (gotsp) {
+               while ((len = getline(&buf, &bufsz, stream)) > 0) {
+                       if (multibyte) {
+                               for (C = buf; *C != '\0'; C += len) {
+                                       ++charct;
+                                       len = mbtowc(&wc, C, MB_CUR_MAX);
+                                       if (len == -1) {
+                                               (void)mbtowc(NULL, NULL,
+                                                   MB_CUR_MAX);
+                                               len = 1;
+                                               wc = L' ';
+                                       }
+                                       if (iswspace(wc)) {
+                                               gotsp = 1;
+                                               if (wc == L'\n')
+                                                       ++linect;
+                                       } else if (gotsp) {
+                                               gotsp = 0;
+                                               ++wordct;
+                                       }
+                               }
+                       } else {
+                               charct += len;
+                               for (C = buf; *C != '\0'; ++C) {
+                                       if (isspace((unsigned char)*C)) {
+                                               gotsp = 1;
+                                               if (*C == '\n')
+                                                       ++linect;
+                                       } else if (gotsp) {
                                                gotsp = 0;
                                                ++wordct;
                                        }
                                }
                        }
                }
-               if (len == -1) {
+               if (ferror(stream)) {
                        warn("%s", file);
                        rval = 1;
                }
@@ -224,7 +257,7 @@ cnt(char *file)
        twordct += wordct;
        tcharct += charct;
 
-       if (close(fd) != 0) {
+       if ((stream == NULL ? close(fd) : fclose(stream)) != 0) {
                warn("%s", file);
                rval = 1;
        }

Re: UTF-8 support for wc(1)

Reply via email to