[CCing diffutils-devel.]

Paul Eggert wrote in
<https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00021.html>:
> > Level 3: Behave correctly. Don't split a 2-Unicode-character sequence.
> >   This is what code that uses mbrtoc32() does, when it has the
> >   lines
> >       if (bytes == (size_t) -3)
> >         bytes = 0;
> >   and uses !mbsinit (&state) in the loop termination condition.
>
> With diffutils even level 3 would not suffice, since diffutils truncates
> at input byte boundaries, so it doesn't suffice to merely treat (size_t)
> -3 as zero even if one also checks mbsinit. Instead, one would have to
> treat all the characters in the sequence ABBB... (where A is an ordinary
> multibyte character and the Bs all return (size_t) -3) as a single unit,
Yes. As far as I can see, this proposed patch should cope with (size_t) -3 returns correctly. The "trick" is to put the mbrtoc32 call into a do { ... mbrtoc32 ... } while (! mbsinit (&state)); loop. The patch is attached. Here's the "diff -w", for better readability. diff --git a/src/side.c b/src/side.c index 8404c3a..d5149de 100644 --- a/src/side.c +++ b/src/side.c @@ -136,34 +136,47 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound) break; default: + /* Invariant: mbstate is in the initial state here. */ + do { char32_t wc; size_t bytes = mbrtoc32 (&wc, tp0, text_limit - tp0, &mbstate); - if (bytes <= MB_LEN_MAX) + if (bytes == (size_t) -1 || bytes == (size_t) -2) + { + /* An encoding error (bytes == (size_t) -1), as + (size_t) -2 cannot happen as the buffer ends in '\n'. */ + if (tp0 < text_limit) + { + /* Consume one byte. Assume it has print width 1. */ + if (ckd_add (&in_position, in_position, 1)) + return out_position; + if (in_position <= out_bound) + putc (*tp0, out); + tp0++; + } + memset (&mbstate, '\0', sizeof mbstate); + break; + } + else { int width = c32width (wc); if (0 < width && ckd_add (&in_position, in_position, width)) return out_position; + if (bytes == (size_t) -3) + bytes = 0; if (in_position <= out_bound) { out_position = in_position; fwrite (tp0, 1, bytes, out); } - text_pointer = tp0 + bytes; - - /* Resume scanning for single-byte characters, as - shift states are not supported. */ - break; + tp0 += bytes; } } - - /* An encoding error (bytes == (size_t) -1), - as (size_t) -2 cannot happen as the buffer ends in '\n', - and (size_t) -3 cannot happen on any known platform. - Reset, and assume the error has print width 1. */ - memset (&mbstate, 0, sizeof mbstate); - FALLTHROUGH; + while (! mbsinit (&mbstate)); + /* Invariant: mbstate is in the initial state here again. */ + text_pointer = tp0; + break; /* Print width 1. */ case ' ': case '!': case '"': case '#': case '%':
>From 9a7f1dc16cc7696a8a3ddbd09b33106cdb77d2a5 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Tue, 4 Jul 2023 21:24:59 +0200 Subject: [PATCH] diff: Improve handling of mbrtoc32 result * src/side.c (print_half_line): When mbrtoc32 has left the mbstate not in the initial state, continue calling mbrtoc32. --- src/side.c | 67 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/src/side.c b/src/side.c index 8404c3a..d5149de 100644 --- a/src/side.c +++ b/src/side.c @@ -136,34 +136,47 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound) break; default: - { - char32_t wc; - size_t bytes = mbrtoc32 (&wc, tp0, text_limit - tp0, &mbstate); - - if (bytes <= MB_LEN_MAX) - { - int width = c32width (wc); - if (0 < width && ckd_add (&in_position, in_position, width)) - return out_position; - if (in_position <= out_bound) - { - out_position = in_position; - fwrite (tp0, 1, bytes, out); - } - text_pointer = tp0 + bytes; - - /* Resume scanning for single-byte characters, as - shift states are not supported. */ - break; - } - } + /* Invariant: mbstate is in the initial state here. */ + do + { + char32_t wc; + size_t bytes = mbrtoc32 (&wc, tp0, text_limit - tp0, &mbstate); - /* An encoding error (bytes == (size_t) -1), - as (size_t) -2 cannot happen as the buffer ends in '\n', - and (size_t) -3 cannot happen on any known platform. - Reset, and assume the error has print width 1. */ - memset (&mbstate, 0, sizeof mbstate); - FALLTHROUGH; + if (bytes == (size_t) -1 || bytes == (size_t) -2) + { + /* An encoding error (bytes == (size_t) -1), as + (size_t) -2 cannot happen as the buffer ends in '\n'. */ + if (tp0 < text_limit) + { + /* Consume one byte. Assume it has print width 1. 
*/ + if (ckd_add (&in_position, in_position, 1)) + return out_position; + if (in_position <= out_bound) + putc (*tp0, out); + tp0++; + } + memset (&mbstate, '\0', sizeof mbstate); + break; + } + else + { + int width = c32width (wc); + if (0 < width && ckd_add (&in_position, in_position, width)) + return out_position; + if (bytes == (size_t) -3) + bytes = 0; + if (in_position <= out_bound) + { + out_position = in_position; + fwrite (tp0, 1, bytes, out); + } + tp0 += bytes; + } + } + while (! mbsinit (&mbstate)); + /* Invariant: mbstate is in the initial state here again. */ + text_pointer = tp0; + break; /* Print width 1. */ case ' ': case '!': case '"': case '#': case '%': -- 2.34.1