Hi,
our wc(1) utility currently violates POSIX in two ways:
1. The -m option counts bytes instead of characters.
The patch given below fixes that.
2. Word counting with -w only treats ASCII whitespace as word
boundaries and regards two words joined by non-ASCII whitespace
as one single word.
The second issue is not related to UTF-8, but a matter of full
Unicode support. It would not be hard to fix that by using
mbtowc(3) and iswblank(3) instead of mblen(3). However, I don't
think we want to pollute our base system tools with functions
requiring full Unicode support, not even to the extent available
in our own C library. So I consider iswblank(3) taboo for now.
A few notes about the patch:
* As usual, reduce the ridiculous setlocale(LC_ALL, "")
to what is actually needed, setlocale(LC_CTYPE, "").
* As usual, -m only differs from -c if LC_CTYPE is set
to a multibyte encoding.
* In the case /* Do it the hard way... */,
we need to switch from read(2) to getline(3)
because read(2) might chop multibyte characters to pieces.
That doesn't affect memory consumption of "wc -l" or "wc -c",
not even for huge binary files without newline characters.
It does increase memory consumption for files with very long
lines when -w or -m is requested - but that's not a problem
because both only make sense with real text, and real text
does not have lines of a length that getline(3) is unable
to handle.
OK?
Ingo
Index: wc.1
===================================================================
RCS file: /cvs/src/usr.bin/wc/wc.1,v
retrieving revision 1.25
diff -u -p -r1.25 wc.1
--- wc.1 21 Apr 2015 10:46:48 -0000 1.25
+++ wc.1 29 Nov 2015 16:34:28 -0000
@@ -72,9 +72,7 @@ using powers of 2 for sizes (K=1024, M=1
The number of lines in each input file
is written to the standard output.
.It Fl m
-Intended to count characters instead of bytes;
-currently an alias for
-.Fl c .
+Count characters instead of bytes.
.It Fl w
The number of words in each input file
is written to the standard output.
@@ -111,7 +109,8 @@ The
.Nm
utility is compliant with the
.St -p1003.1-2008
-specification, except that it ignores the locale.
+specification, except that it recognizes word boundaries only at ASCII
+whitespace.
.Pp
The flag
.Op Fl h
@@ -121,7 +120,16 @@ A
.Nm
utility appeared in
.At v1 .
-.Sh BUGS
+.Sh CAVEATS
The
.Fl m
-option counts bytes instead of characters.
+option depends on the character set
+.Xr locale 1 .
+If
+.Ev LC_CTYPE
+is set to
+.Qq C
+or
+.Qq POSIX ,
+it has the same effect as
+.Fl c .
Index: wc.c
===================================================================
RCS file: /cvs/src/usr.bin/wc/wc.c,v
retrieving revision 1.19
diff -u -p -r1.19 wc.c
--- wc.c 9 Oct 2015 01:37:09 -0000 1.19
+++ wc.c 29 Nov 2015 16:34:28 -0000
@@ -42,7 +42,7 @@
#include <util.h>
int64_t tlinect, twordct, tcharct;
-int doline, doword, dochar, humanchar;
+int doline, doword, dochar, humanchar, multibyte;
int rval;
extern char *__progname;
@@ -55,7 +55,7 @@ main(int argc, char *argv[])
{
int ch;
- setlocale(LC_ALL, "");
+ setlocale(LC_CTYPE, "");
if (pledge("stdio rpath", NULL) == -1)
err(1, "pledge");
@@ -68,8 +68,11 @@ main(int argc, char *argv[])
case 'w':
doword = 1;
break;
- case 'c':
case 'm':
+ if (MB_CUR_MAX > 1)
+ multibyte = 1;
+ /* FALLTHROUGH */
+ case 'c':
dochar = 1;
break;
case 'h':
@@ -112,15 +115,19 @@ main(int argc, char *argv[])
void
cnt(char *file)
{
+ static char *buf;
+ static ssize_t bufsz;
+
+ FILE *stream;
u_char *C;
short gotsp;
- int len;
+ ssize_t len;
int64_t linect, wordct, charct;
struct stat sbuf;
int fd;
- u_char buf[MAXBSIZE];
linect = wordct = charct = 0;
+ stream = NULL;
if (file) {
if ((fd = open(file, O_RDONLY, 0)) < 0) {
warn("%s", file);
@@ -131,7 +138,10 @@ cnt(char *file)
fd = STDIN_FILENO;
}
- if (!doword) {
+ if (!doword && !multibyte) {
+ if (bufsz < MAXBSIZE &&
+ (buf = realloc(buf, MAXBSIZE)) == NULL)
+ err(1, NULL);
/*
* Line counting is split out because it's a lot
* faster to get lines than to get words, since
@@ -178,16 +188,25 @@ cnt(char *file)
}
}
} else {
+ if (file == NULL)
+ stream = stdin;
+ else if ((stream = fdopen(fd, "r")) == NULL) {
+ warn("%s", file);
+ close(fd);
+ rval = 1;
+ return;
+ }
+
/* Do it the hard way... */
gotsp = 1;
- while ((len = read(fd, buf, MAXBSIZE)) > 0) {
- /*
- * This loses in the presence of multi-byte characters.
- * To do it right would require a function to return a
- * character while knowing how many bytes it consumed.
- */
- charct += len;
- for (C = buf; len--; ++C) {
+ while ((len = getline(&buf, &bufsz, stream)) > 0) {
+ for (C = buf; *C != '\0'; ++C) {
+ ++charct;
+ /*
+ * XXX For now, we don't want full
+ * Unicode support. Only treat ASCII
+ * whitespace as whitespace.
+ */
if (isspace(*C)) {
gotsp = 1;
if (*C == '\n')
@@ -205,10 +224,13 @@ cnt(char *file)
gotsp = 0;
++wordct;
}
+ if (multibyte &&
+ (len = mblen(C, MB_CUR_MAX)) > 1)
+ C += len - 1;
}
}
}
- if (len == -1) {
+ if (ferror(stream)) {
warn("%s", file);
rval = 1;
}
@@ -224,7 +246,7 @@ cnt(char *file)
twordct += wordct;
tcharct += charct;
- if (close(fd) != 0) {
+ if ((stream == NULL ? close(fd) : fclose(stream)) != 0) {
warn("%s", file);
rval = 1;
}