unistr/u8-*: Make Unicode decoder more Unicode Standard compliant

Bruno Haible Tue, 25 Jul 2023 13:36:32 -0700

This patch makes the UTF-8 decoder of gnulib and libunistring (mainly
u8_mbtouc) more compliant with the Unicode Standard, specifically with
section 3.9 of <https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf>.



2023-07-25  Bruno Haible  <br...@clisp.org>

        unistr/u8-*: Make Unicode decoder more Unicode Standard compliant.
        Based on a remark by Paul Eggert in
        <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00120.html>.
        * tests/unistr/test-u8-mbtouc.c (test_safe_function): Change expected
        results for "non-shortest form" or out-of-range byte sequences. Add new
        test cases of incomplete well-formed byte sequences.
        * tests/unistr/test-u8-mbsnlen.c (main): Likewise.
        * lib/unistr/u8-mbtouc-aux.c (u8_mbtouc_aux): Reject a first byte in the
        range 0xF5..0xF7 as invalid. Distinguish incomplete from invalid byte
        sequences correctly. For the former, return only the number of bytes in
        the maximal well-formed subpart.
        * lib/unistr/u8-mbtouc.c (u8_mbtouc): Likewise.
        * lib/unistr/u8-check.c (u8_check): Reject a first byte in the range
        0xF5..0xF7 as invalid.
        * lib/unistr/u8-mblen.c (u8_mblen): Likewise.
        * lib/unistr/u8-mbtoucr.c (u8_mbtoucr): Likewise.
        * lib/unistr/u8-strmbtouc.c (u8_strmbtouc): Likewise.
        * lib/unistr/u8-strmblen.c (u8_strmblen): Likewise.
        * lib/unistr/u8-prev.c (u8_prev): Likewise.

diff --git a/lib/unistr/u8-check.c b/lib/unistr/u8-check.c
index 2f03cd9af0..53217006ea 100644
--- a/lib/unistr/u8-check.c
+++ b/lib/unistr/u8-check.c
@@ -57,13 +57,13 @@ u8_check (const uint8_t *s, size_t n)
                   continue;
                 }
             }
-          else if (c < 0xf8)
+          else if (c <= 0xf4)
             {
               if (s + 4 <= s_end
                   && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
                   && (s[3] ^ 0x80) < 0x40
                   && (c >= 0xf1 || s[1] >= 0x90)
-                  && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
+                  && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
                 {
                   s += 4;
                   continue;
diff --git a/lib/unistr/u8-mblen.c b/lib/unistr/u8-mblen.c
index a5f88dedef..d989afc437 100644
--- a/lib/unistr/u8-mblen.c
+++ b/lib/unistr/u8-mblen.c
@@ -47,13 +47,13 @@ u8_mblen (const uint8_t *s, size_t n)
                   && (c != 0xed || s[1] < 0xa0))
                 return 3;
             }
-          else if (c < 0xf8)
+          else if (c <= 0xf4)
             {
               if (n >= 4
                   && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
                   && (s[3] ^ 0x80) < 0x40
                   && (c >= 0xf1 || s[1] >= 0x90)
-                  && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
+                  && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
                 return 4;
             }
         }
diff --git a/lib/unistr/u8-mbtouc-aux.c b/lib/unistr/u8-mbtouc-aux.c
index a6b7edcfb9..15568c3bc8 100644
--- a/lib/unistr/u8-mbtouc-aux.c
+++ b/lib/unistr/u8-mbtouc-aux.c
@@ -52,20 +52,15 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
         {
           if (n >= 3)
             {
-              if ((s[1] ^ 0x80) < 0x40)
+              if ((s[1] ^ 0x80) < 0x40
+                  && (c >= 0xe1 || s[1] >= 0xa0)
+                  && (c != 0xed || s[1] < 0xa0))
                 {
                   if ((s[2] ^ 0x80) < 0x40)
                     {
-                      if ((c >= 0xe1 || s[1] >= 0xa0)
-                          && (c != 0xed || s[1] < 0xa0))
-                        {
-                          *puc = ((unsigned int) (c & 0x0f) << 12)
-                                 | ((unsigned int) (s[1] ^ 0x80) << 6)
-                                 | (unsigned int) (s[2] ^ 0x80);
-                          return 3;
-                        }
-                      /* invalid multibyte character */
-                      *puc = 0xfffd;
+                      *puc = ((unsigned int) (c & 0x0f) << 12)
+                             | ((unsigned int) (s[1] ^ 0x80) << 6)
+                             | (unsigned int) (s[2] ^ 0x80);
                       return 3;
                     }
                   /* invalid multibyte character */
@@ -73,38 +68,50 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
                   return 2;
                 }
               /* invalid multibyte character */
+              *puc = 0xfffd;
+              return 1;
             }
           else
             {
-              /* incomplete multibyte character */
               *puc = 0xfffd;
-              if (n == 1 || (s[1] ^ 0x80) >= 0x40)
-                return 1;
+              if (n == 1)
+                {
+                  /* incomplete multibyte character */
+                  return 1;
+                }
               else
-                return 2;
+                {
+                  if ((s[1] ^ 0x80) < 0x40
+                      && (c >= 0xe1 || s[1] >= 0xa0)
+                      && (c != 0xed || s[1] < 0xa0))
+                    {
+                      /* incomplete multibyte character */
+                      return 2;
+                    }
+                  else
+                    {
+                      /* invalid multibyte character */
+                      return 1;
+                    }
+                }
             }
         }
-      else if (c < 0xf8)
+      else if (c <= 0xf4)
         {
           if (n >= 4)
             {
-              if ((s[1] ^ 0x80) < 0x40)
+              if ((s[1] ^ 0x80) < 0x40
+                  && (c >= 0xf1 || s[1] >= 0x90)
+                  && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
                 {
                   if ((s[2] ^ 0x80) < 0x40)
                     {
                       if ((s[3] ^ 0x80) < 0x40)
                         {
-                          if ((c >= 0xf1 || s[1] >= 0x90)
-                              && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
-                            {
-                              *puc = ((unsigned int) (c & 0x07) << 18)
-                                     | ((unsigned int) (s[1] ^ 0x80) << 12)
-                                     | ((unsigned int) (s[2] ^ 0x80) << 6)
-                                     | (unsigned int) (s[3] ^ 0x80);
-                              return 4;
-                            }
-                          /* invalid multibyte character */
-                          *puc = 0xfffd;
+                          *puc = ((unsigned int) (c & 0x07) << 18)
+                                 | ((unsigned int) (s[1] ^ 0x80) << 12)
+                                 | ((unsigned int) (s[2] ^ 0x80) << 6)
+                                 | (unsigned int) (s[3] ^ 0x80);
                           return 4;
                         }
                       /* invalid multibyte character */
@@ -116,17 +123,48 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
                   return 2;
                 }
               /* invalid multibyte character */
+              *puc = 0xfffd;
+              return 1;
             }
           else
             {
-              /* incomplete multibyte character */
               *puc = 0xfffd;
-              if (n == 1 || (s[1] ^ 0x80) >= 0x40)
-                return 1;
-              else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
-                return 2;
+              if (n == 1)
+                {
+                  /* incomplete multibyte character */
+                  return 1;
+                }
               else
-                return 3;
+                {
+                  if ((s[1] ^ 0x80) < 0x40
+                      && (c >= 0xf1 || s[1] >= 0x90)
+                      && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
+                    {
+                      if (n == 2)
+                        {
+                          /* incomplete multibyte character */
+                          return 2;
+                        }
+                      else
+                        {
+                          if ((s[2] ^ 0x80) < 0x40)
+                            {
+                              /* incomplete multibyte character */
+                              return 3;
+                            }
+                          else
+                            {
+                              /* invalid multibyte character */
+                              return 2;
+                            }
+                        }
+                    }
+                  else
+                    {
+                      /* invalid multibyte character */
+                      return 1;
+                    }
+                }
             }
         }
     }
diff --git a/lib/unistr/u8-mbtouc.c b/lib/unistr/u8-mbtouc.c
index e30e5203c1..920ad6f558 100644
--- a/lib/unistr/u8-mbtouc.c
+++ b/lib/unistr/u8-mbtouc.c
@@ -62,20 +62,15 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
         {
           if (n >= 3)
             {
-              if ((s[1] ^ 0x80) < 0x40)
+              if ((s[1] ^ 0x80) < 0x40
+                  && (c >= 0xe1 || s[1] >= 0xa0)
+                  && (c != 0xed || s[1] < 0xa0))
                 {
                   if ((s[2] ^ 0x80) < 0x40)
                     {
-                      if ((c >= 0xe1 || s[1] >= 0xa0)
-                          && (c != 0xed || s[1] < 0xa0))
-                        {
-                          *puc = ((unsigned int) (c & 0x0f) << 12)
-                                 | ((unsigned int) (s[1] ^ 0x80) << 6)
-                                 | (unsigned int) (s[2] ^ 0x80);
-                          return 3;
-                        }
-                      /* invalid multibyte character */
-                      *puc = 0xfffd;
+                      *puc = ((unsigned int) (c & 0x0f) << 12)
+                             | ((unsigned int) (s[1] ^ 0x80) << 6)
+                             | (unsigned int) (s[2] ^ 0x80);
                       return 3;
                     }
                   /* invalid multibyte character */
@@ -83,38 +78,50 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
                   return 2;
                 }
               /* invalid multibyte character */
+              *puc = 0xfffd;
+              return 1;
             }
           else
             {
-              /* incomplete multibyte character */
               *puc = 0xfffd;
-              if (n == 1 || (s[1] ^ 0x80) >= 0x40)
-                return 1;
+              if (n == 1)
+                {
+                  /* incomplete multibyte character */
+                  return 1;
+                }
               else
-                return 2;
+                {
+                  if ((s[1] ^ 0x80) < 0x40
+                      && (c >= 0xe1 || s[1] >= 0xa0)
+                      && (c != 0xed || s[1] < 0xa0))
+                    {
+                      /* incomplete multibyte character */
+                      return 2;
+                    }
+                  else
+                    {
+                      /* invalid multibyte character */
+                      return 1;
+                    }
+                }
             }
         }
-      else if (c < 0xf8)
+      else if (c <= 0xf4)
         {
           if (n >= 4)
             {
-              if ((s[1] ^ 0x80) < 0x40)
+              if ((s[1] ^ 0x80) < 0x40
+                  && (c >= 0xf1 || s[1] >= 0x90)
+                  && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
                 {
                   if ((s[2] ^ 0x80) < 0x40)
                     {
                       if ((s[3] ^ 0x80) < 0x40)
                         {
-                          if ((c >= 0xf1 || s[1] >= 0x90)
-                              && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
-                            {
-                              *puc = ((unsigned int) (c & 0x07) << 18)
-                                     | ((unsigned int) (s[1] ^ 0x80) << 12)
-                                     | ((unsigned int) (s[2] ^ 0x80) << 6)
-                                     | (unsigned int) (s[3] ^ 0x80);
-                              return 4;
-                            }
-                          /* invalid multibyte character */
-                          *puc = 0xfffd;
+                          *puc = ((unsigned int) (c & 0x07) << 18)
+                                 | ((unsigned int) (s[1] ^ 0x80) << 12)
+                                 | ((unsigned int) (s[2] ^ 0x80) << 6)
+                                 | (unsigned int) (s[3] ^ 0x80);
                           return 4;
                         }
                       /* invalid multibyte character */
@@ -126,17 +133,48 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
                   return 2;
                 }
               /* invalid multibyte character */
+              *puc = 0xfffd;
+              return 1;
             }
           else
             {
-              /* incomplete multibyte character */
               *puc = 0xfffd;
-              if (n == 1 || (s[1] ^ 0x80) >= 0x40)
-                return 1;
-              else if (n == 2 || (s[2] ^ 0x80) >= 0x40)
-                return 2;
+              if (n == 1)
+                {
+                  /* incomplete multibyte character */
+                  return 1;
+                }
               else
-                return 3;
+                {
+                  if ((s[1] ^ 0x80) < 0x40
+                      && (c >= 0xf1 || s[1] >= 0x90)
+                      && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
+                    {
+                      if (n == 2)
+                        {
+                          /* incomplete multibyte character */
+                          return 2;
+                        }
+                      else
+                        {
+                          if ((s[2] ^ 0x80) < 0x40)
+                            {
+                              /* incomplete multibyte character */
+                              return 3;
+                            }
+                          else
+                            {
+                              /* invalid multibyte character */
+                              return 2;
+                            }
+                        }
+                    }
+                  else
+                    {
+                      /* invalid multibyte character */
+                      return 1;
+                    }
+                }
             }
         }
     }
diff --git a/lib/unistr/u8-mbtoucr.c b/lib/unistr/u8-mbtoucr.c
index d09051128f..296062d233 100644
--- a/lib/unistr/u8-mbtoucr.c
+++ b/lib/unistr/u8-mbtoucr.c
@@ -86,13 +86,13 @@ u8_mbtoucr (ucs4_t *puc, const uint8_t *s, size_t n)
               return -2;
             }
         }
-      else if (c < 0xf8)
+      else if (c <= 0xf4)
         {
           if (n >= 2)
             {
               if ((s[1] ^ 0x80) < 0x40
                   && (c >= 0xf1 || s[1] >= 0x90)
-                  && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
+                  && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
                 {
                   if (n >= 3)
                     {
diff --git a/lib/unistr/u8-prev.c b/lib/unistr/u8-prev.c
index 1012486b36..ad8a347c19 100644
--- a/lib/unistr/u8-prev.c
+++ b/lib/unistr/u8-prev.c
@@ -63,9 +63,9 @@ u8_prev (ucs4_t *puc, const uint8_t *s, const uint8_t *start)
                       {
                         uint8_t c_4 = s[-4];
 
-                        if (c_4 >= 0xf0 && c_4 < 0xf8
+                        if (c_4 >= 0xf0 && c_4 <= 0xf4
                             && (c_4 >= 0xf1 || c_3 >= 0x90)
-                            && (c_4 < 0xf4 || (c_4 == 0xf4 && c_3 < 0x90)))
+                            && (c_4 < 0xf4 || (/* c_4 == 0xf4 && */ c_3 < 0x90)))
                           {
                             *puc = ((unsigned int) (c_4 & 0x07) << 18)
                                    | ((unsigned int) (c_3 ^ 0x80) << 12)
diff --git a/lib/unistr/u8-strmblen.c b/lib/unistr/u8-strmblen.c
index 558771341a..a34a01fc14 100644
--- a/lib/unistr/u8-strmblen.c
+++ b/lib/unistr/u8-strmblen.c
@@ -51,12 +51,12 @@ u8_strmblen (const uint8_t *s)
               && (c != 0xed || s[1] < 0xa0))
             return 3;
         }
-      else if (c < 0xf8)
+      else if (c <= 0xf4)
         {
           if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
               && (s[3] ^ 0x80) < 0x40
               && (c >= 0xf1 || s[1] >= 0x90)
-              && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
+              && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
             return 4;
         }
     }
diff --git a/lib/unistr/u8-strmbtouc.c b/lib/unistr/u8-strmbtouc.c
index a47fbbb84f..259d3c2f37 100644
--- a/lib/unistr/u8-strmbtouc.c
+++ b/lib/unistr/u8-strmbtouc.c
@@ -63,12 +63,12 @@ u8_strmbtouc (ucs4_t *puc, const uint8_t *s)
               return 3;
             }
         }
-      else if (c < 0xf8)
+      else if (c <= 0xf4)
         {
           if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
               && (s[3] ^ 0x80) < 0x40
               && (c >= 0xf1 || s[1] >= 0x90)
-              && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
+              && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
             {
               *puc = ((unsigned int) (c & 0x07) << 18)
                      | ((unsigned int) (s[1] ^ 0x80) << 12)
diff --git a/tests/unistr/test-u8-mbsnlen.c b/tests/unistr/test-u8-mbsnlen.c
index c0b9b6e3f1..67b80d02a7 100644
--- a/tests/unistr/test-u8-mbsnlen.c
+++ b/tests/unistr/test-u8-mbsnlen.c
@@ -61,9 +61,18 @@ main ()
      that a "malformed sequence" is interpreted in the same way as
      "a character that is outside the adopted subset".
      Reference:
+       ISO 10646-1 amendment 2
+       <https://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
        Markus Kuhn: UTF-8 decoder capability and stress test
        <https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
        <https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html>
+     These old specifications (from ca. 2000) were a bit ambiguous, and the
+     definition of UTF-8 has changed a bit as well.  The newer specification
+     we obey is the Unicode Standard, version 15.
+     Reference:
+       Unicode Standard 15.0.0, section 3.9
+       <https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf>
+       pages 124..129, especially table 3-7.
    */
   /* 3.1. Test that each unexpected continuation byte is signalled as a
      malformed sequence of its own.  */
@@ -97,9 +106,14 @@ main ()
   }
   /* 3.3.2. 3-byte sequence with last byte missing.  */
   {
-    static const uint8_t input[] = { '"', 0xE0, 0x80, '"' };
+    static const uint8_t input[] = { '"', 0xE0, 0xA0, '"' };
     ASSERT (u8_mbsnlen (input, 4) == 3);
   }
+  {
+    /* Outdated example: 0xE0 0x80 is an ill-formed sequence.  */
+    static const uint8_t input[] = { '"', 0xE0, 0x80, '"' };
+    ASSERT (u8_mbsnlen (input, 4) == 4);
+  }
   /* 3.3.7. 3-byte sequence with last byte missing.  */
   {
     static const uint8_t input[] = { '"', 0xEF, 0xBF, '"' };
@@ -107,14 +121,24 @@ main ()
   }
   /* 3.3.3. 4-byte sequence with last byte missing.  */
   {
-    static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' };
+    static const uint8_t input[] = { '"', 0xF0, 0x90, 0x80, '"' };
     ASSERT (u8_mbsnlen (input, 5) == 3);
   }
+  {
+    /* Outdated example: 0xF0 0x80 is an ill-formed sequence.  */
+    static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' };
+    ASSERT (u8_mbsnlen (input, 5) == 5);
+  }
   /* 3.3.8. 4-byte sequence with last byte missing.  */
   {
-    static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' };
+    static const uint8_t input[] = { '"', 0xF3, 0xBF, 0xBF, '"' };
     ASSERT (u8_mbsnlen (input, 5) == 3);
   }
+  {
+    /* Outdated example: 0xF7 is an invalid first byte.  */
+    static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' };
+    ASSERT (u8_mbsnlen (input, 5) == 5);
+  }
 
   return 0;
 }
diff --git a/tests/unistr/test-u8-mbtouc.c b/tests/unistr/test-u8-mbtouc.c
index 35c70c2193..a695ba1c70 100644
--- a/tests/unistr/test-u8-mbtouc.c
+++ b/tests/unistr/test-u8-mbtouc.c
@@ -34,9 +34,18 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
      that a "malformed sequence" is interpreted in the same way as
      "a character that is outside the adopted subset".
      Reference:
+       ISO 10646-1 amendment 2
+       <https://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
        Markus Kuhn: UTF-8 decoder capability and stress test
        <https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
        <https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html>
+     These old specifications (from ca. 2000) were a bit ambiguous, and the
+     definition of UTF-8 has changed a bit as well.  The newer specification
+     we obey is the Unicode Standard, version 15.
+     Reference:
+       Unicode Standard 15.0.0, section 3.9
+       <https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf>
+       pages 124..129, especially table 3-7.
    */
   /* 3.1. Test that each unexpected continuation byte is signalled as a
      malformed sequence of its own.  */
@@ -118,7 +127,7 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
   }
   /* 3.3.2. 3-byte sequence with last byte missing.  */
   {
-    static const uint8_t input[] = { '"', 0xE0, 0x80, '"' };
+    static const uint8_t input[] = { '"', 0xE0, 0xA0, '"' };
     uc = 0xBADFACE;
     ret = my_u8_mbtouc (&uc, input, 4);
     ASSERT (ret == 1);
@@ -132,6 +141,26 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
     ASSERT (ret == 1);
     ASSERT (uc == 0x0022);
   }
+  {
+    /* Outdated example: 0xE0 0x80 is an ill-formed sequence.  */
+    static const uint8_t input[] = { '"', 0xE0, 0x80, '"' };
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input, 4);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0x0022);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 1, 3);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0xFFFD);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 2, 2);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0xFFFD);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 3, 1);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0x0022);
+  }
   /* 3.3.7. 3-byte sequence with last byte missing.  */
   {
     static const uint8_t input[] = { '"', 0xEF, 0xBF, '"' };
@@ -150,7 +179,7 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
   }
   /* 3.3.3. 4-byte sequence with last byte missing.  */
   {
-    static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' };
+    static const uint8_t input[] = { '"', 0xF0, 0x90, 0x80, '"' };
     uc = 0xBADFACE;
     ret = my_u8_mbtouc (&uc, input, 5);
     ASSERT (ret == 1);
@@ -164,9 +193,33 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
     ASSERT (ret == 1);
     ASSERT (uc == 0x0022);
   }
+  {
+    /* Outdated example: 0xF0 0x80 is an ill-formed sequence.  */
+    static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' };
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input, 5);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0x0022);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 1, 4);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0xFFFD);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 2, 3);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0xFFFD);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 3, 2);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0xFFFD);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 4, 1);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0x0022);
+  }
   /* 3.3.8. 4-byte sequence with last byte missing.  */
   {
-    static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' };
+    static const uint8_t input[] = { '"', 0xF3, 0xBF, 0xBF, '"' };
     uc = 0xBADFACE;
     ret = my_u8_mbtouc (&uc, input, 5);
     ASSERT (ret == 1);
@@ -180,6 +233,30 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t))
     ASSERT (ret == 1);
     ASSERT (uc == 0x0022);
   }
+  {
+    /* Outdated example: 0xF7 is an invalid first byte.  */
+    static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' };
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input, 5);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0x0022);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 1, 4);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0xFFFD);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 2, 3);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0xFFFD);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 3, 2);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0xFFFD);
+    uc = 0xBADFACE;
+    ret = my_u8_mbtouc (&uc, input + 4, 1);
+    ASSERT (ret == 1);
+    ASSERT (uc == 0x0022);
+  }
 }
 
 int

unistr/u8-*: Make Unicode decoder more Unicode Standard compliant

Reply via email to