This patch makes the UTF-8 decoder in gnulib and libunistring (mainly u8_mbtouc) more compliant with the Unicode Standard, specifically with section 3.9 of <https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf>.
2023-07-25  Bruno Haible  <br...@clisp.org>

	unistr/u8-*: Make Unicode decoder more Unicode Standard compliant.
	Based on a remark by Paul Eggert in
	<https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00120.html>.
	* tests/unistr/test-u8-mbtouc.c (test_safe_function): Change expected
	results for "non-shortest form" or out-of-range byte sequences. Add new
	test cases of incomplete well-formed byte sequences.
	* tests/unistr/test-u8-mbsnlen.c (main): Likewise.
	* lib/unistr/u8-mbtouc-aux.c (u8_mbtouc_aux): Reject a first byte in the
	range 0xF5..0xF7 as invalid. Distinguish incomplete from invalid byte
	sequences correctly. For the former, return only the number of bytes in
	the maximal well-formed subpart.
	* lib/unistr/u8-mbtouc.c (u8_mbtouc): Likewise.
	* lib/unistr/u8-check.c (u8_check): Reject a first byte in the range
	0xF5..0xF7 as invalid.
	* lib/unistr/u8-mblen.c (u8_mblen): Likewise.
	* lib/unistr/u8-mbtoucr.c (u8_mbtoucr): Likewise.
	* lib/unistr/u8-strmbtouc.c (u8_strmbtouc): Likewise.
	* lib/unistr/u8-strmblen.c (u8_strmblen): Likewise.
	* lib/unistr/u8-prev.c (u8_prev): Likewise.
diff --git a/lib/unistr/u8-check.c b/lib/unistr/u8-check.c index 2f03cd9af0..53217006ea 100644 --- a/lib/unistr/u8-check.c +++ b/lib/unistr/u8-check.c @@ -57,13 +57,13 @@ u8_check (const uint8_t *s, size_t n) continue; } } - else if (c < 0xf8) + else if (c <= 0xf4) { if (s + 4 <= s_end && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90) - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))) + && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90))) { s += 4; continue; diff --git a/lib/unistr/u8-mblen.c b/lib/unistr/u8-mblen.c index a5f88dedef..d989afc437 100644 --- a/lib/unistr/u8-mblen.c +++ b/lib/unistr/u8-mblen.c @@ -47,13 +47,13 @@ u8_mblen (const uint8_t *s, size_t n) && (c != 0xed || s[1] < 0xa0)) return 3; } - else if (c < 0xf8) + else if (c <= 0xf4) { if (n >= 4 && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90) - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))) + && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90))) return 4; } } diff --git a/lib/unistr/u8-mbtouc-aux.c b/lib/unistr/u8-mbtouc-aux.c index a6b7edcfb9..15568c3bc8 100644 --- a/lib/unistr/u8-mbtouc-aux.c +++ b/lib/unistr/u8-mbtouc-aux.c @@ -52,20 +52,15 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n) { if (n >= 3) { - if ((s[1] ^ 0x80) < 0x40) + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) { if ((s[2] ^ 0x80) < 0x40) { - if ((c >= 0xe1 || s[1] >= 0xa0) - && (c != 0xed || s[1] < 0xa0)) - { - *puc = ((unsigned int) (c & 0x0f) << 12) - | ((unsigned int) (s[1] ^ 0x80) << 6) - | (unsigned int) (s[2] ^ 0x80); - return 3; - } - /* invalid multibyte character */ - *puc = 0xfffd; + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); return 3; } /* invalid multibyte character */ @@ -73,38 +68,50 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n) return 2; } /* invalid multibyte character */ + *puc = 
0xfffd; + return 1; } else { - /* incomplete multibyte character */ *puc = 0xfffd; - if (n == 1 || (s[1] ^ 0x80) >= 0x40) - return 1; + if (n == 1) + { + /* incomplete multibyte character */ + return 1; + } else - return 2; + { + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) + { + /* incomplete multibyte character */ + return 2; + } + else + { + /* invalid multibyte character */ + return 1; + } + } } } - else if (c < 0xf8) + else if (c <= 0xf4) { if (n >= 4) { - if ((s[1] ^ 0x80) < 0x40) + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xf1 || s[1] >= 0x90) + && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90))) { if ((s[2] ^ 0x80) < 0x40) { if ((s[3] ^ 0x80) < 0x40) { - if ((c >= 0xf1 || s[1] >= 0x90) - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))) - { - *puc = ((unsigned int) (c & 0x07) << 18) - | ((unsigned int) (s[1] ^ 0x80) << 12) - | ((unsigned int) (s[2] ^ 0x80) << 6) - | (unsigned int) (s[3] ^ 0x80); - return 4; - } - /* invalid multibyte character */ - *puc = 0xfffd; + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); return 4; } /* invalid multibyte character */ @@ -116,17 +123,48 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n) return 2; } /* invalid multibyte character */ + *puc = 0xfffd; + return 1; } else { - /* incomplete multibyte character */ *puc = 0xfffd; - if (n == 1 || (s[1] ^ 0x80) >= 0x40) - return 1; - else if (n == 2 || (s[2] ^ 0x80) >= 0x40) - return 2; + if (n == 1) + { + /* incomplete multibyte character */ + return 1; + } else - return 3; + { + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xf1 || s[1] >= 0x90) + && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90))) + { + if (n == 2) + { + /* incomplete multibyte character */ + return 2; + } + else + { + if ((s[2] ^ 0x80) < 0x40) + { + /* incomplete multibyte character */ + return 3; + } + else + { + /* invalid multibyte character */ + return 2; + } + } + 
} + else + { + /* invalid multibyte character */ + return 1; + } + } } } } diff --git a/lib/unistr/u8-mbtouc.c b/lib/unistr/u8-mbtouc.c index e30e5203c1..920ad6f558 100644 --- a/lib/unistr/u8-mbtouc.c +++ b/lib/unistr/u8-mbtouc.c @@ -62,20 +62,15 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n) { if (n >= 3) { - if ((s[1] ^ 0x80) < 0x40) + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) { if ((s[2] ^ 0x80) < 0x40) { - if ((c >= 0xe1 || s[1] >= 0xa0) - && (c != 0xed || s[1] < 0xa0)) - { - *puc = ((unsigned int) (c & 0x0f) << 12) - | ((unsigned int) (s[1] ^ 0x80) << 6) - | (unsigned int) (s[2] ^ 0x80); - return 3; - } - /* invalid multibyte character */ - *puc = 0xfffd; + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); return 3; } /* invalid multibyte character */ @@ -83,38 +78,50 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n) return 2; } /* invalid multibyte character */ + *puc = 0xfffd; + return 1; } else { - /* incomplete multibyte character */ *puc = 0xfffd; - if (n == 1 || (s[1] ^ 0x80) >= 0x40) - return 1; + if (n == 1) + { + /* incomplete multibyte character */ + return 1; + } else - return 2; + { + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) + { + /* incomplete multibyte character */ + return 2; + } + else + { + /* invalid multibyte character */ + return 1; + } + } } } - else if (c < 0xf8) + else if (c <= 0xf4) { if (n >= 4) { - if ((s[1] ^ 0x80) < 0x40) + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xf1 || s[1] >= 0x90) + && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90))) { if ((s[2] ^ 0x80) < 0x40) { if ((s[3] ^ 0x80) < 0x40) { - if ((c >= 0xf1 || s[1] >= 0x90) - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))) - { - *puc = ((unsigned int) (c & 0x07) << 18) - | ((unsigned int) (s[1] ^ 0x80) << 12) - | ((unsigned int) (s[2] ^ 0x80) << 6) - | (unsigned int) (s[3] ^ 0x80); - return 4; - } - /* invalid 
multibyte character */ - *puc = 0xfffd; + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); return 4; } /* invalid multibyte character */ @@ -126,17 +133,48 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n) return 2; } /* invalid multibyte character */ + *puc = 0xfffd; + return 1; } else { - /* incomplete multibyte character */ *puc = 0xfffd; - if (n == 1 || (s[1] ^ 0x80) >= 0x40) - return 1; - else if (n == 2 || (s[2] ^ 0x80) >= 0x40) - return 2; + if (n == 1) + { + /* incomplete multibyte character */ + return 1; + } else - return 3; + { + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xf1 || s[1] >= 0x90) + && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90))) + { + if (n == 2) + { + /* incomplete multibyte character */ + return 2; + } + else + { + if ((s[2] ^ 0x80) < 0x40) + { + /* incomplete multibyte character */ + return 3; + } + else + { + /* invalid multibyte character */ + return 2; + } + } + } + else + { + /* invalid multibyte character */ + return 1; + } + } } } } diff --git a/lib/unistr/u8-mbtoucr.c b/lib/unistr/u8-mbtoucr.c index d09051128f..296062d233 100644 --- a/lib/unistr/u8-mbtoucr.c +++ b/lib/unistr/u8-mbtoucr.c @@ -86,13 +86,13 @@ u8_mbtoucr (ucs4_t *puc, const uint8_t *s, size_t n) return -2; } } - else if (c < 0xf8) + else if (c <= 0xf4) { if (n >= 2) { if ((s[1] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90) - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))) + && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90))) { if (n >= 3) { diff --git a/lib/unistr/u8-prev.c b/lib/unistr/u8-prev.c index 1012486b36..ad8a347c19 100644 --- a/lib/unistr/u8-prev.c +++ b/lib/unistr/u8-prev.c @@ -63,9 +63,9 @@ u8_prev (ucs4_t *puc, const uint8_t *s, const uint8_t *start) { uint8_t c_4 = s[-4]; - if (c_4 >= 0xf0 && c_4 < 0xf8 + if (c_4 >= 0xf0 && c_4 <= 0xf4 && (c_4 >= 0xf1 || c_3 >= 0x90) - && (c_4 < 0xf4 || (c_4 == 0xf4 && c_3 < 0x90))) + && (c_4 < 0xf4 || (/* c_4 == 0xf4 
&& */ c_3 < 0x90))) { *puc = ((unsigned int) (c_4 & 0x07) << 18) | ((unsigned int) (c_3 ^ 0x80) << 12) diff --git a/lib/unistr/u8-strmblen.c b/lib/unistr/u8-strmblen.c index 558771341a..a34a01fc14 100644 --- a/lib/unistr/u8-strmblen.c +++ b/lib/unistr/u8-strmblen.c @@ -51,12 +51,12 @@ u8_strmblen (const uint8_t *s) && (c != 0xed || s[1] < 0xa0)) return 3; } - else if (c < 0xf8) + else if (c <= 0xf4) { if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90) - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))) + && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90))) return 4; } } diff --git a/lib/unistr/u8-strmbtouc.c b/lib/unistr/u8-strmbtouc.c index a47fbbb84f..259d3c2f37 100644 --- a/lib/unistr/u8-strmbtouc.c +++ b/lib/unistr/u8-strmbtouc.c @@ -63,12 +63,12 @@ u8_strmbtouc (ucs4_t *puc, const uint8_t *s) return 3; } } - else if (c < 0xf8) + else if (c <= 0xf4) { if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90) - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))) + && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90))) { *puc = ((unsigned int) (c & 0x07) << 18) | ((unsigned int) (s[1] ^ 0x80) << 12) diff --git a/tests/unistr/test-u8-mbsnlen.c b/tests/unistr/test-u8-mbsnlen.c index c0b9b6e3f1..67b80d02a7 100644 --- a/tests/unistr/test-u8-mbsnlen.c +++ b/tests/unistr/test-u8-mbsnlen.c @@ -61,9 +61,18 @@ main () that a "malformed sequence" is interpreted in the same way as "a character that is outside the adopted subset". Reference: + ISO 10646-1 amendment 2 + <https://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html> Markus Kuhn: UTF-8 decoder capability and stress test <https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt> <https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html> + These old specifications (from ca. 2000) were a bit ambiguous, and the + definition of UTF-8 has changed a bit as well. The newer specification + we obey is the Unicode Standard, version 15. 
+ Reference: + Unicode Standard 15.0.0, section 3.9 + <https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf> + pages 124..129, especially table 3-7. */ /* 3.1. Test that each unexpected continuation byte is signalled as a malformed sequence of its own. */ @@ -97,9 +106,14 @@ main () } /* 3.3.2. 3-byte sequence with last byte missing. */ { - static const uint8_t input[] = { '"', 0xE0, 0x80, '"' }; + static const uint8_t input[] = { '"', 0xE0, 0xA0, '"' }; ASSERT (u8_mbsnlen (input, 4) == 3); } + { + /* Outdated example: 0xE0 0x80 is an ill-formed sequence. */ + static const uint8_t input[] = { '"', 0xE0, 0x80, '"' }; + ASSERT (u8_mbsnlen (input, 4) == 4); + } /* 3.3.7. 3-byte sequence with last byte missing. */ { static const uint8_t input[] = { '"', 0xEF, 0xBF, '"' }; @@ -107,14 +121,24 @@ main () } /* 3.3.3. 4-byte sequence with last byte missing. */ { - static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' }; + static const uint8_t input[] = { '"', 0xF0, 0x90, 0x80, '"' }; ASSERT (u8_mbsnlen (input, 5) == 3); } + { + /* Outdated example: 0xF0 0x80 is an ill-formed sequence. */ + static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' }; + ASSERT (u8_mbsnlen (input, 5) == 5); + } /* 3.3.8. 4-byte sequence with last byte missing. */ { - static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' }; + static const uint8_t input[] = { '"', 0xF3, 0xBF, 0xBF, '"' }; ASSERT (u8_mbsnlen (input, 5) == 3); } + { + /* Outdated example: 0xF7 is an invalid first byte. 
*/ + static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' }; + ASSERT (u8_mbsnlen (input, 5) == 5); + } return 0; } diff --git a/tests/unistr/test-u8-mbtouc.c b/tests/unistr/test-u8-mbtouc.c index 35c70c2193..a695ba1c70 100644 --- a/tests/unistr/test-u8-mbtouc.c +++ b/tests/unistr/test-u8-mbtouc.c @@ -34,9 +34,18 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t)) that a "malformed sequence" is interpreted in the same way as "a character that is outside the adopted subset". Reference: + ISO 10646-1 amendment 2 + <https://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html> Markus Kuhn: UTF-8 decoder capability and stress test <https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt> <https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html> + These old specifications (from ca. 2000) were a bit ambiguous, and the + definition of UTF-8 has changed a bit as well. The newer specification + we obey is the Unicode Standard, version 15. + Reference: + Unicode Standard 15.0.0, section 3.9 + <https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf> + pages 124..129, especially table 3-7. */ /* 3.1. Test that each unexpected continuation byte is signalled as a malformed sequence of its own. */ @@ -118,7 +127,7 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t)) } /* 3.3.2. 3-byte sequence with last byte missing. */ { - static const uint8_t input[] = { '"', 0xE0, 0x80, '"' }; + static const uint8_t input[] = { '"', 0xE0, 0xA0, '"' }; uc = 0xBADFACE; ret = my_u8_mbtouc (&uc, input, 4); ASSERT (ret == 1); @@ -132,6 +141,26 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t)) ASSERT (ret == 1); ASSERT (uc == 0x0022); } + { + /* Outdated example: 0xE0 0x80 is an ill-formed sequence. 
*/ + static const uint8_t input[] = { '"', 0xE0, 0x80, '"' }; + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 4); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 1, 3); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 2, 2); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 3, 1); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + } /* 3.3.7. 3-byte sequence with last byte missing. */ { static const uint8_t input[] = { '"', 0xEF, 0xBF, '"' }; @@ -150,7 +179,7 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t)) } /* 3.3.3. 4-byte sequence with last byte missing. */ { - static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' }; + static const uint8_t input[] = { '"', 0xF0, 0x90, 0x80, '"' }; uc = 0xBADFACE; ret = my_u8_mbtouc (&uc, input, 5); ASSERT (ret == 1); @@ -164,9 +193,33 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t)) ASSERT (ret == 1); ASSERT (uc == 0x0022); } + { + /* Outdated example: 0xF0 0x80 is an ill-formed sequence. */ + static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' }; + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 5); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 1, 4); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 2, 3); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 3, 2); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 4, 1); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + } /* 3.3.8. 4-byte sequence with last byte missing. 
*/ { - static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' }; + static const uint8_t input[] = { '"', 0xF3, 0xBF, 0xBF, '"' }; uc = 0xBADFACE; ret = my_u8_mbtouc (&uc, input, 5); ASSERT (ret == 1); @@ -180,6 +233,30 @@ test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t)) ASSERT (ret == 1); ASSERT (uc == 0x0022); } + { + /* Outdated example: 0xF7 is an invalid first byte. */ + static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' }; + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 5); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 1, 4); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 2, 3); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 3, 2); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 4, 1); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + } } int