Paul Eggert wrote:
> >> * What happens when strings contain encoding errors? It's not clear from
> >> the spec. I hope behavior isn't simply undefined.
> > 
> > When the str_* functions are used, the byte-wise encoding will matter.
> 
> I thought that str_* functions didn't care about locale, which means the 
> character encoding does not matter for them.

Yes, that's what I meant. Sorry for the misunderstanding.

> > When the mbiter primitives are used, recall that they cope with
> > encoding errors (via the 'bool cur.wc_valid'); thus I expect that
> > encoding errors in the range of the suffix will match if it's the
> > same encoding error in both argument strings.
> 
> Don't we have problems with mbs_startswith, though? If the prefix ends 
> in an incomplete multibyte character (an encoding error), ...

Oh, I now see what you mean.

mbs_endswith needs a tweak to support that case; patch below.

mbs_startswith has the same problem: to handle a prefix that ends in an
incomplete multibyte character, it also needs an mbiter-based implementation.
I'm still working on that...


2025-01-04  Bruno Haible  <br...@clisp.org>

        mbs_endswith: Fix abort in the case of incomplete characters.
        Reported by Paul Eggert.
        * lib/mbs_endswith.c: Don't include <stdlib.h>.
        (mbs_endswith): Instead of aborting, return false.
        * tests/test-mbs_endswith2.c (main): Test invalid and incomplete
        characters.

diff --git a/lib/mbs_endswith.c b/lib/mbs_endswith.c
index 8f163e1aa5..00d5128d43 100644
--- a/lib/mbs_endswith.c
+++ b/lib/mbs_endswith.c
@@ -23,8 +23,6 @@
 
 #include "mbiter.h"
 
-#include <stdlib.h>
-
 bool
 mbs_endswith (const char *string, const char *suffix)
 {
@@ -62,13 +60,15 @@ mbs_endswith (const char *string, const char *suffix)
           for (; len > n; len--)
             {
               if (!mbi_avail (iter))
-                abort ();
+                /* We can get here due to incomplete multibyte characters.  */
+                return false;
               mbi_advance (iter);
             }
           if (!mbi_avail (iter))
-            abort ();
+            /* We can get here due to incomplete multibyte characters.  */
+            return false;
           return strcmp (mbi_cur_ptr (iter), suffix) == 0;
         }
     }
-  return 0;
+  return false;
 }
diff --git a/tests/test-mbs_endswith2.c b/tests/test-mbs_endswith2.c
index f76f9b04e4..35c3b8652d 100644
--- a/tests/test-mbs_endswith2.c
+++ b/tests/test-mbs_endswith2.c
@@ -62,5 +62,29 @@ main ()
  ASSERT (mbs_endswith ("\341\272\213\303\277\341\272\221", "\341\272\213\303\277\341\272\221")); /* "ẋÿẑ" "ẋÿẑ" */
  ASSERT (mbs_endswith ("\303\277\341\272\213\341\272\213\303\277\341\272\221", "\341\272\213\303\277\341\272\221")); /* "ÿẋẋÿẑ" "ẋÿẑ" */
 
+  /* Test cases with invalid or incomplete characters.  */
+
+  /* A valid character should not match an invalid character.  */
+  ASSERT (!mbs_endswith ("\303\247", "\301\247"));
+  ASSERT (!mbs_endswith ("\301\247", "\303\247"));
+
+  /* A valid character should not match an incomplete character.  */
+  ASSERT (!mbs_endswith ("\303\247", "\343\247"));
+  ASSERT (!mbs_endswith ("\343\247", "\303\247"));
+
+  /* An invalid character should not match an incomplete character.  */
+  ASSERT (!mbs_endswith ("\301\247", "\343\247"));
+  ASSERT (!mbs_endswith ("\343\247", "\301\247"));
+
+  /* Two invalid characters should match only if they are identical.  */
+  ASSERT (!mbs_endswith ("\301\246", "\301\247"));
+  ASSERT (!mbs_endswith ("\301\247", "\301\246"));
+  ASSERT (mbs_endswith ("\301\247", "\301\247"));
+
+  /* Two incomplete characters should match only if they are identical.  */
+  ASSERT (!mbs_endswith ("\343\246", "\343\247"));
+  ASSERT (!mbs_endswith ("\343\247", "\343\246"));
+  ASSERT (mbs_endswith ("\343\247", "\343\247"));
+
   return test_exit_status;
 }




Reply via email to