Hi,
Martijn van Duren wrote on Sat, Jan 08, 2022 at 08:30:20AM +0100:
> Why not go for the following diff?
> It has a comparable speed increase, but without the added complexity
> of a second inner loop.
Actually, i like the idea of not duplicating the loop, in cases where
that is possible without a noticeable performance cost.
I admit i OK'ed the original UTF-8 diff almost six years ago,
but looking at the code again, i now dislike how it is testing
every byte twice for no apparent reason. Even worse, i regard
the lack of I/O error checking on write operations as a bug.
Note that it does make sense to proceed to the next file on *read*
errors, whereas a *write* error should better be fatal.
For those reason, and because i prefer versions of isu8cont()
that do not inspect the locale, i propose the following patch
instead.
As an aside, i tried using fwrite(3) instead of the manual
putchar(*u) loop, but that is almost exactly as slow as repeatedly
calling MB_CUR_MAX, probably due to either locking or orientation
setting or some aspect of iov handling or buffering or more than
one of those; i refrained from profiling it.
----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
Index: rev.c
===================================================================
RCS file: /cvs/src/usr.bin/rev/rev.c,v
retrieving revision 1.13
diff -u -p -r1.13 rev.c
--- rev.c 10 Apr 2016 17:06:52 -0000 1.13
+++ rev.c 8 Jan 2022 23:19:46 -0000
@@ -46,13 +46,14 @@ void usage(void);
int
main(int argc, char *argv[])
{
- char *filename, *p = NULL, *t, *u;
+ char *filename, *p = NULL, *t, *te, *u;
FILE *fp;
ssize_t len;
size_t ps = 0;
- int ch, rval;
+ int ch, multibyte, rval;
setlocale(LC_CTYPE, "");
+ multibyte = MB_CUR_MAX > 1;
if (pledge("stdio rpath", NULL) == -1)
err(1, "pledge");
@@ -83,14 +84,16 @@ main(int argc, char *argv[])
if (p[len - 1] == '\n')
--len;
for (t = p + len - 1; t >= p; --t) {
- if (isu8cont(*t))
- continue;
- u = t;
- do {
- putchar(*u);
- } while (isu8cont(*(++u)));
+ te = t;
+ if (multibyte)
+ while (t > p && isu8cont(*t))
+ --t;
+ for (u = t; u <= te; ++u)
+ if (putchar(*u) == EOF)
+ err(1, "stdout");
}
- putchar('\n');
+ if (putchar('\n') == EOF)
+ err(1, "stdout");
}
if (ferror(fp)) {
warn("%s", filename);
@@ -104,7 +107,7 @@ main(int argc, char *argv[])
int
isu8cont(unsigned char c)
{
- return MB_CUR_MAX > 1 && (c & (0x80 | 0x40)) == 0x80;
+ return (c & (0x80 | 0x40)) == 0x80;
}
void
----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
The rest of this mail contains test reports.
These test reports use two test programs:
----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
/* makeutf8.c */
#include <err.h>
#include <errno.h>
#include <locale.h>
#include <stdio.h>
#include <wchar.h>
int
main(void)
{
FILE *in, *out;
char *line = NULL;
size_t linesize = 0;
unsigned int wc, wcl = 0;
if (setlocale(LC_CTYPE, "en_US.UTF-8") == NULL)
err(1, "setlocale");
if ((in = fopen("/usr/src/gnu/usr.bin/perl/lib/unicore/"
"UnicodeData.txt", "r")) == NULL)
err(1, "in");
if ((out = fopen("utf8.txt", "w")) == NULL)
err(1, "out");
while (getline(&line, &linesize, in) != -1) {
if (sscanf(line, "%x;", &wc) < 1)
err(1, "scanf: %s", line);
if (wc > wcl + 1)
if (fputwc(L'\n', out) == WEOF)
err(1, "write EOL");
if (fputwc(wc, out) == WEOF) {
if (errno == EILSEQ)
warn("write U+%04x", wc);
else
err(1, "write U+%04x", wc);
}
wcl = wc;
}
if (ferror(in))
errx(1, "read error");
if (fputwc(L'\n', out) == WEOF)
err(1, "write final EOL");
fclose(out);
fclose(in);
return 0;
}
----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
/* wrap.c */
#include <unistd.h>
int
main(int argc, char *argv[])
{
close(STDOUT_FILENO);
execl(argv[1], argv[1], "/usr/share/dict/words", NULL);
return 42;
}
----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
# Prepare for testing.
$ make makeutf8
cc -O2 -pipe -Werror-implicit-function-declaration -MD -MP \
-o makeutf8 /usr/src/usr.bin/rev/makeutf8.c
$ make wrap
cc -O2 -pipe -Werror-implicit-function-declaration -MD -MP \
-o wrap /usr/src/usr.bin/rev/wrap.c
$ make
cc -O2 -pipe -Werror-implicit-function-declaration -MD -MP \
-c /usr/src/usr.bin/rev/rev.c
cc -o rev rev.o
$ ./obj/makeutf8
makeutf8: write U+d800: Illegal byte sequence
makeutf8: write U+db7f: Illegal byte sequence
makeutf8: write U+db80: Illegal byte sequence
makeutf8: write U+dbff: Illegal byte sequence
makeutf8: write U+dc00: Illegal byte sequence
makeutf8: write U+dfff: Illegal byte sequence
$ wc utf8.txt
695 693 116982 utf8.txt
# Check that the current code works properly.
$ unset LC_ALL
$ export LC_CTYPE=C
$ /oldbin/rev /usr/share/dict/words > words_ocr.txt
$ /oldbin/rev words_ocr.txt > words_ocrr.txt
$ /oldbin/rev utf8.txt > utf8_ocr.txt
$ /oldbin/rev utf8_ocr.txt > utf8_ocrr.txt
$ export LC_CTYPE=en_US.UTF-8
$ /oldbin/rev /usr/share/dict/words > words_our.txt
$ /oldbin/rev words_our.txt > words_ourr.txt
$ /oldbin/rev utf8.txt > utf8_our.txt
$ /oldbin/rev utf8_our.txt > utf8_ourr.txt
$ diff -u /usr/share/dict/words words_ocrr.txt
$ diff -u /usr/share/dict/words words_ourr.txt
$ diff -u words_ocr.txt words_our.txt
$ diff -u utf8.txt utf8_ocrr.txt
$ diff -u utf8.txt utf8_ourr.txt
$ cmp utf8_ocr.txt utf8_our.txt
utf8_ocr.txt utf8_our.txt differ: char 12, line 2
# Check that the new code works properly.
$ export LC_CTYPE=C
$ ./obj/rev /usr/share/dict/words > words_ncr.txt
$ ./obj/rev utf8.txt > utf8_ncr.txt
$ export LC_CTYPE=en_US.UTF-8
$ ./obj/rev /usr/share/dict/words > words_nur.txt
$ ./obj/rev utf8.txt > utf8_nur.txt
$ diff -u words_ocr.txt words_ncr.txt
$ diff -u words_our.txt words_nur.txt
$ diff -u utf8_ocr.txt utf8_ncr.txt
$ diff -u utf8_our.txt utf8_nur.txt
$
# Demonstate broken error handling in the old code.
$ ./obj/wrap /obin/rev
$ echo $?
42
# Demonstrate working error handling in the new code.
$ ./obj/wrap ./obj/rev
rev: stdout: Bad file descriptor
$ echo $?
1
# Check performance.
$ export LC_CTYPE=C
$ for i in $(jot 10); do time /oldbin/rev words_ocr.txt > /dev/null; done
0m00.09s real 0m00.10s user 0m00.01s system
0m00.09s real 0m00.10s user 0m00.00s system
0m00.09s real 0m00.10s user 0m00.01s system
0m00.09s real 0m00.10s user 0m00.00s system
0m00.09s real 0m00.11s user 0m00.00s system
0m00.09s real 0m00.10s user 0m00.00s system
0m00.09s real 0m00.09s user 0m00.01s system
0m00.09s real 0m00.09s user 0m00.01s system
0m00.09s real 0m00.08s user 0m00.01s system
0m00.09s real 0m00.09s user 0m00.00s system
$ for i in $(jot 10); do time ./obj/rev words_ocr.txt > /dev/null; done
0m00.02s real 0m00.03s user 0m00.00s system
0m00.02s real 0m00.03s user 0m00.00s system
0m00.02s real 0m00.02s user 0m00.00s system
0m00.02s real 0m00.02s user 0m00.00s system
0m00.02s real 0m00.02s user 0m00.02s system
0m00.02s real 0m00.03s user 0m00.00s system
0m00.02s real 0m00.02s user 0m00.00s system
0m00.02s real 0m00.02s user 0m00.00s system
0m00.02s real 0m00.03s user 0m00.00s system
0m00.02s real 0m00.03s user 0m00.00s system
$ export LC_CTYPE=en_US.UTF-8
$ for i in $(jot 10); do time /oldbin/rev words_ocr.txt > /dev/null; done
0m00.09s real 0m00.10s user 0m00.00s system
0m00.10s real 0m00.12s user 0m00.00s system
0m00.09s real 0m00.10s user 0m00.00s system
0m00.09s real 0m00.10s user 0m00.00s system
0m00.09s real 0m00.09s user 0m00.01s system
0m00.09s real 0m00.10s user 0m00.00s system
0m00.09s real 0m00.10s user 0m00.00s system
0m00.09s real 0m00.08s user 0m00.01s system
0m00.09s real 0m00.09s user 0m00.00s system
0m00.09s real 0m00.09s user 0m00.00s system
$ for i in $(jot 10); do time ./obj/rev words_ocr.txt > /dev/null; done
0m00.03s real 0m00.02s user 0m00.01s system
0m00.03s real 0m00.02s user 0m00.00s system
0m00.03s real 0m00.03s user 0m00.00s system
0m00.03s real 0m00.03s user 0m00.00s system
0m00.03s real 0m00.03s user 0m00.00s system
0m00.03s real 0m00.03s user 0m00.00s system
0m00.03s real 0m00.02s user 0m00.01s system
0m00.03s real 0m00.03s user 0m00.00s system
0m00.03s real 0m00.02s user 0m00.01s system
0m00.03s real 0m00.03s user 0m00.00s system
# Regression tests.
$ doas make install
$ cd /usr/src/regress/usr.bin/rev/
$ make clean
rm -f a.out [Ee]rrs mklog *.core y.tab.h out.ascii.txt out.utf8.txt
$ make regress ; echo $?
0