[CCing diffutils-devel.]

Paul Eggert wrote in
<https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00021.html>:
> >    Level 3: Behave correctly. Don't split a 2-Unicode-character sequence.
> >             This is what code that uses mbrtoc32() does, when it has the
> >             lines
> >                  if (bytes == (size_t) -3)
> >                    bytes = 0;
> >             and uses !mbsinit (&state) in the loop termination condition.
> 
> With diffutils even level 3 would not suffice, since diffutils truncates 
> at input byte boundaries, so it doesn't suffice to merely treat (size_t) 
> -3 as zero even if one also checks mbsinit. Instead, one would have to 
> treat all the characters in the sequence ABBB... (where A is an ordinary 
> multibyte character and the Bs all return (size_t) -3) as a single unit, 

Yes.

As far as I can see, this proposed patch should cope with (size_t) -3
returns correctly. The "trick" is to put the mbrtoc32 call into a

   do
     {
       ... mbrtoc32 ...

     }
   while (! mbsinit (&state));

loop.


The patch is attached. Here's the "diff -w", for better readability.

diff --git a/src/side.c b/src/side.c
index 8404c3a..d5149de 100644
--- a/src/side.c
+++ b/src/side.c
@@ -136,34 +136,47 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound)
           break;
 
         default:
+          /* Invariant: mbstate is in the initial state here.  */
+          do
             {
               char32_t wc;
               size_t bytes = mbrtoc32 (&wc, tp0, text_limit - tp0, &mbstate);
 
-           if (bytes <= MB_LEN_MAX)
+              if (bytes == (size_t) -1 || bytes == (size_t) -2)
+                {
+                  /* An encoding error (bytes == (size_t) -1), as
+                     (size_t) -2 cannot happen as the buffer ends in '\n'.  */
+                  if (tp0 < text_limit)
+                    {
+                      /* Consume one byte.  Assume it has print width 1.  */
+                      if (ckd_add (&in_position, in_position, 1))
+                        return out_position;
+                      if (in_position <= out_bound)
+                        putc (*tp0, out);
+                      tp0++;
+                    }
+                  memset (&mbstate, '\0', sizeof mbstate);
+                  break;
+                }
+              else
                 {
                   int width = c32width (wc);
                   if (0 < width && ckd_add (&in_position, in_position, width))
                     return out_position;
+                  if (bytes == (size_t) -3)
+                    bytes = 0;
                   if (in_position <= out_bound)
                     {
                       out_position = in_position;
                       fwrite (tp0, 1, bytes, out);
                     }
-                text_pointer = tp0 + bytes;
-
-               /* Resume scanning for single-byte characters, as
-                  shift states are not supported.  */
-                break;
+                  tp0 += bytes;
                 }
             }
-
-         /* An encoding error (bytes == (size_t) -1),
-            as (size_t) -2 cannot happen as the buffer ends in '\n',
-            and (size_t) -3 cannot happen on any known platform.
-            Reset, and assume the error has print width 1.  */
-         memset (&mbstate, 0, sizeof mbstate);
-          FALLTHROUGH;
+          while (! mbsinit (&mbstate));
+          /* Invariant: mbstate is in the initial state here again.  */
+          text_pointer = tp0;
+          break;
 
         /* Print width 1.  */
         case ' ': case '!': case '"': case '#': case '%':
From 9a7f1dc16cc7696a8a3ddbd09b33106cdb77d2a5 Mon Sep 17 00:00:00 2001
From: Bruno Haible <br...@clisp.org>
Date: Tue, 4 Jul 2023 21:24:59 +0200
Subject: [PATCH] diff: Improve handling of mbrtoc32 result

* src/side.c (print_half_line): When mbrtoc32 has left the mbstate not
in the initial state, continue calling mbrtoc32.
---
 src/side.c | 67 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 27 deletions(-)

diff --git a/src/side.c b/src/side.c
index 8404c3a..d5149de 100644
--- a/src/side.c
+++ b/src/side.c
@@ -136,34 +136,47 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound)
           break;
 
         default:
-          {
-            char32_t wc;
-            size_t bytes = mbrtoc32 (&wc, tp0, text_limit - tp0, &mbstate);
-
-	    if (bytes <= MB_LEN_MAX)
-              {
-                int width = c32width (wc);
-		if (0 < width && ckd_add (&in_position, in_position, width))
-		  return out_position;
-                if (in_position <= out_bound)
-                  {
-                    out_position = in_position;
-                    fwrite (tp0, 1, bytes, out);
-                  }
-                text_pointer = tp0 + bytes;
-
-		/* Resume scanning for single-byte characters, as
-		   shift states are not supported.  */
-                break;
-              }
-          }
+          /* Invariant: mbstate is in the initial state here.  */
+          do
+            {
+              char32_t wc;
+              size_t bytes = mbrtoc32 (&wc, tp0, text_limit - tp0, &mbstate);
 
-	  /* An encoding error (bytes == (size_t) -1),
-	     as (size_t) -2 cannot happen as the buffer ends in '\n',
-	     and (size_t) -3 cannot happen on any known platform.
-	     Reset, and assume the error has print width 1.  */
-	  memset (&mbstate, 0, sizeof mbstate);
-          FALLTHROUGH;
+              if (bytes == (size_t) -1 || bytes == (size_t) -2)
+                {
+                  /* An encoding error (bytes == (size_t) -1), as
+                     (size_t) -2 cannot happen as the buffer ends in '\n'.  */
+                  if (tp0 < text_limit)
+                    {
+                      /* Consume one byte.  Assume it has print width 1.  */
+                      if (ckd_add (&in_position, in_position, 1))
+                        return out_position;
+                      if (in_position <= out_bound)
+                        putc (*tp0, out);
+                      tp0++;
+                    }
+                  memset (&mbstate, '\0', sizeof mbstate);
+                  break;
+                }
+              else
+                {
+                  int width = c32width (wc);
+                  if (0 < width && ckd_add (&in_position, in_position, width))
+                    return out_position;
+                  if (bytes == (size_t) -3)
+                    bytes = 0;
+                  if (in_position <= out_bound)
+                    {
+                      out_position = in_position;
+                      fwrite (tp0, 1, bytes, out);
+                    }
+                  tp0 += bytes;
+                }
+            }
+          while (! mbsinit (&mbstate));
+          /* Invariant: mbstate is in the initial state here again.  */
+          text_pointer = tp0;
+          break;
 
         /* Print width 1.  */
         case ' ': case '!': case '"': case '#': case '%':
-- 
2.34.1

Reply via email to