Module Name: src Committed By: riastradh Date: Sun Aug 18 02:19:35 UTC 2024
Modified Files: src/lib/libc/locale: c16rtomb.3 c16rtomb.c c32rtomb.3 c8rtomb.3 c8rtomb.c src/tests/lib/libc/locale: t_c16rtomb.c t_c8rtomb.c Log Message: c8rtomb(3), c16rtomb(3): Fix NUL handling. PR lib/58615: incomplete c8rtomb, c16rtomb handles NUL termination wrong To generate a diff of this commit: cvs rdiff -u -r1.7 -r1.8 src/lib/libc/locale/c16rtomb.3 \ src/lib/libc/locale/c32rtomb.3 cvs rdiff -u -r1.5 -r1.6 src/lib/libc/locale/c16rtomb.c \ src/lib/libc/locale/c8rtomb.3 cvs rdiff -u -r1.4 -r1.5 src/lib/libc/locale/c8rtomb.c cvs rdiff -u -r1.2 -r1.3 src/tests/lib/libc/locale/t_c16rtomb.c \ src/tests/lib/libc/locale/t_c8rtomb.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/lib/libc/locale/c16rtomb.3 diff -u src/lib/libc/locale/c16rtomb.3:1.7 src/lib/libc/locale/c16rtomb.3:1.8 --- src/lib/libc/locale/c16rtomb.3:1.7 Sat Aug 17 01:52:51 2024 +++ src/lib/libc/locale/c16rtomb.3 Sun Aug 18 02:19:35 2024 @@ -1,4 +1,4 @@ -.\" $NetBSD: c16rtomb.3,v 1.7 2024/08/17 01:52:51 riastradh Exp $ +.\" $NetBSD: c16rtomb.3,v 1.8 2024/08/18 02:19:35 riastradh Exp $ .\" .\" Copyright (c) 2024 The NetBSD Foundation, Inc. .\" All rights reserved. @@ -75,11 +75,13 @@ with the same state .Fa ps , the sequence of .Fa c16 -values must be a well-formed UTF-16 code unit sequence. +values must be a well-formed UTF-16 code unit sequence, or an +incomplete UTF-16 code unit sequence followed by null. If .Fa c16 , when appended to the sequence of code units passed in previous calls, -does not form a well-formed UTF-16 code unit sequence, then +is not null and does not form a well-formed UTF-16 code unit sequence, +then .Nm returns .Li (size_t)-1 @@ -95,6 +97,17 @@ is a null pointer, no output is stored, and the return value are unchanged. .Pp If +.Fa c16 +is null, +.Nm +discards any pending incomplete UTF-16 code unit sequence in +.Fa ps , +outputs a (possibly empty) shift sequence to restore the initial state +followed by a null byte, and resets +.Fa ps +to the initial conversion state. +.Pp +If .Fa ps is a null pointer, .Nm @@ -128,10 +141,11 @@ on failure. .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" .Sh EXAMPLES Convert a UTF-16 code unit sequence to a multibyte string, -NUL-terminate it, and print it: +NUL-terminate it (with any shift sequence needed to restore the initial +state), and print it: .Bd -literal -offset indent char16_t c16[] = { 0xd83d, 0xdca9 }; -char buf[__arraycount(c16)*MB_CUR_MAX + 1], *s = buf; +char buf[(__arraycount(c16) + 1)*MB_CUR_MAX], *s = buf; size_t i; mbstate_t mbs = {0}; /* initial conversion state */ @@ -144,7 +158,10 @@ for (i = 0; i < __arraycount(c16); i++) assert(len < sizeof(buf) - (s - buf)); s += len; } -*s = '\e0'; /* NUL-terminate */ +len = c16rtomb(s, L'\e0', &mbs); /* NUL-terminate */ +if (len == (size_t)-1) + err(1, "c16rtomb"); +assert(len <= sizeof(buf) - (s - buf)); printf("%s\en", buf); .Ed .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" @@ -198,32 +215,20 @@ function first appeared in .Nx 11.0 . .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" .Sh BUGS -It is not clear from the standard how -.Nm -is supposed to behave when given a high surrogate code point followed -by a NUL: -.Bd -literal -offset indent -c16rtomb(s, 0xd800, ps); -c16rtomb(s, L'\e0', ps); -.Ed -.Pp -Currently this fails with -.Er EILSEQ -which matches other implementations, but this is at odds with language -in the standard which suggests that passing -.Li L'\e0' -should unconditionally store a null byte and reset -.Fa ps -to the initial conversion state: +The standard requires that a null code unit unconditionally reset the +conversion state and output null: .Bd -filled -offset indent If -.Fa c16 -is a null wide character, a null byte is stored, preceded by any shift +.Fa c8 +is a null character, a null byte is stored, preceded by any shift sequence needed to restore the initial shift state; the resulting state described is the initial conversion state. .Ed .Pp -However, it is unclear what else this should store besides a null -byte. -Should it discard the pending high surrogate, or convert it to -something else and store that? +However, some implementations such as +.Fx 14.0 , +.Ox 7.4 , +and glibc 2.36 ignore this clause and, if the null was preceded by an +incomplete UTF-16 code unit sequence, fail with +.Er EILSEQ +instead. Index: src/lib/libc/locale/c32rtomb.3 diff -u src/lib/libc/locale/c32rtomb.3:1.7 src/lib/libc/locale/c32rtomb.3:1.8 --- src/lib/libc/locale/c32rtomb.3:1.7 Sat Aug 17 01:52:51 2024 +++ src/lib/libc/locale/c32rtomb.3 Sun Aug 18 02:19:35 2024 @@ -1,4 +1,4 @@ -.\" $NetBSD: c32rtomb.3,v 1.7 2024/08/17 01:52:51 riastradh Exp $ +.\" $NetBSD: c32rtomb.3,v 1.8 2024/08/18 02:19:35 riastradh Exp $ .\" .\" Copyright (c) 2024 The NetBSD Foundation, Inc. .\" All rights reserved. @@ -82,6 +82,15 @@ is a null pointer, no output is stored, and the return value are unchanged. .Pp If +.Fa c32 +is null, +.Nm +outputs a (possibly empty) shift sequence to restore the initial state +followed by a null byte and resets +.Fa ps +to the initial conversion state. +.Pp +If .Fa ps is a null pointer, .Nm @@ -115,10 +124,11 @@ on failure. .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" .Sh EXAMPLES Convert a sequence of Unicode scalar values to a multibyte sequence, -NUL-terminate it, and print it: +NUL-terminate it (with any shift sequence needed to restore the initial +state), and print it: .Bd -literal -offset indent char32_t c32[] = { 0x1f4a9, 0x20ac, 0x21 }; -char buf[__arraycountb(c32)*MB_CUR_MAX + 1], *s = buf; +char buf[(__arraycount(c32) + 1)*MB_CUR_MAX], *s = buf; size_t i; mbstate_t mbs = {0}; /* initial conversion state */ @@ -131,7 +141,10 @@ for (i = 0; i < __arraycount(c32); i++) assert(len < sizeof(buf) - (s - buf)); s += len; } -*s = '\e0'; /* NUL-terminate */ +len = c32rtomb(s, L'\e0', &mbs); /* NUL-terminate */ +if (len == (size_t)-1) + err(1, "c32rtomb"); +assert(len <= sizeof(buf) - (s - buf)); printf("%s\en", buf); .Ed .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" Index: src/lib/libc/locale/c16rtomb.c diff -u src/lib/libc/locale/c16rtomb.c:1.5 src/lib/libc/locale/c16rtomb.c:1.6 --- src/lib/libc/locale/c16rtomb.c:1.5 Sat Aug 17 21:24:53 2024 +++ src/lib/libc/locale/c16rtomb.c Sun Aug 18 02:19:35 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: c16rtomb.c,v 1.5 2024/08/17 21:24:53 riastradh Exp $ */ +/* $NetBSD: c16rtomb.c,v 1.6 2024/08/18 02:19:35 riastradh Exp $ */ /*- * Copyright (c) 2024 The NetBSD Foundation, Inc. @@ -66,7 +66,7 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: c16rtomb.c,v 1.5 2024/08/17 21:24:53 riastradh Exp $"); +__RCSID("$NetBSD: c16rtomb.c,v 1.6 2024/08/18 02:19:35 riastradh Exp $"); #include "namespace.h" @@ -140,37 +140,37 @@ c16rtomb_l(char *restrict s, char16_t c1 */ S = (struct c16rtombstate *)(void *)ps; -#if 0 /* - * `If c16 is a null wide character, a null byte is stored, - * preceded by any shift sequence needed to restore the - * initial shift state; the resulting state described is the - * initial conversion state.' + * Handle several cases: * - * XXX But what else gets stored? Do we just discard any - * pending high surrogate, or do we convert it to something - * else, or what? - */ - if (c16 == L'\0') { + * 1. c16 is null. + * 2. Pending high surrogate. + * 3. No pending high surrogate and c16 is a high surrogate. + * 4. No pending high surrogate and c16 is a low surrogate. + * 5. No pending high surrogate and c16 is a BMP scalar value. + */ + if (c16 == L'\0') { /* 1. null */ + /* + * `If c16 is a null wide character, a null byte is + * stored, preceded by any shift sequence needed to + * restore the initial shift state; the resulting + * state described is the initial conversion state.' + * + * So if c16 is null, discard any pending high + * surrogate -- there's nothing we can legitimately do + * with it -- and convert a null scalar value, which by + * definition of c32rtomb writes out any shift sequence + * reset followed by a null byte. + */ S->surrogate = 0; - } -#endif - - /* - * Check whether: - * - * 1. We had previously decoded a high surrogate. - * => Decode the low surrogate -- reject if it's not a low - * surrogate -- and combine them to output a scalar - * value; clear the high surrogate for next time. - * 2. This is a high surrogate. - * => Save it and wait for the low surrogate with no output. - * 3. This is a low surrogate. - * => Reject. - * 4. This is not a surrogate. - * => Output a scalar value. - */ - if (S->surrogate != 0) { /* 1. pending surrogate pair */ + c32 = 0; + } else if (S->surrogate != 0) { /* 2. pending high surrogate */ + /* + * If the previous code unit was a high surrogate, the + * next code unit must be a low surrogate. Reject it + * if not; otherwise clear the high surrogate for next + * time and combine them to output a scalar value. + */ if (c16 < 0xdc00 || c16 > 0xdfff) { errno = EILSEQ; return (size_t)-1; @@ -182,13 +182,27 @@ c16rtomb_l(char *restrict s, char16_t c1 __SHIFTIN(__SHIFTOUT(w2, __BITS(9,0)), __BITS(9,0))); c32 += 0x10000; S->surrogate = 0; - } else if (c16 >= 0xd800 && c16 <= 0xdbff) { /* 2. high surrogate */ + } else if (c16 >= 0xd800 && c16 <= 0xdbff) { /* 3. high surrogate */ + /* + * No pending high surrogate and this code unit is a + * high surrogate. Save it for next time, and output + * nothing -- we don't yet know what the next scalar + * value will be until we receive the low surrogate. + */ S->surrogate = c16; return 0; /* produced nothing */ - } else if (c16 >= 0xdc00 && c16 <= 0xdfff) { /* 3. low surrogate */ + } else if (c16 >= 0xdc00 && c16 <= 0xdfff) { /* 4. low surrogate */ + /* + * No pending high surrogate and this code unit is a + * low surrogate. That's invalid; fail with EILSEQ. + */ errno = EILSEQ; return (size_t)-1; - } else { /* 4. not a surrogate */ + } else { /* 5. not a surrogate */ + /* + * Code unit is a scalar value in the BMP. Just output + * it as is. + */ c32 = c16; } Index: src/lib/libc/locale/c8rtomb.3 diff -u src/lib/libc/locale/c8rtomb.3:1.5 src/lib/libc/locale/c8rtomb.3:1.6 --- src/lib/libc/locale/c8rtomb.3:1.5 Sat Aug 17 00:32:19 2024 +++ src/lib/libc/locale/c8rtomb.3 Sun Aug 18 02:19:35 2024 @@ -1,4 +1,4 @@ -.\" $NetBSD: c8rtomb.3,v 1.5 2024/08/17 00:32:19 riastradh Exp $ +.\" $NetBSD: c8rtomb.3,v 1.6 2024/08/18 02:19:35 riastradh Exp $ .\" .\" Copyright (c) 2024 The NetBSD Foundation, Inc. .\" All rights reserved. @@ -75,11 +75,13 @@ with the same state .Fa ps , the sequence of .Fa c8 -values must be a well-formed UTF-8 code unit sequence. +values must be a well-formed UTF-8 code unit sequence, or an +incomplete UTF-8 code unit sequence followed by null. If .Fa c8 , when appended to the sequence of code units passed in previous calls, -does not form a well-formed UTF-8 code unit sequence, then +is not null and does not form a well-formed UTF-8 code unit sequence, +then .Nm returns .Li (size_t)-1 @@ -95,6 +97,17 @@ is a null pointer, no output is stored, and the return value are unchanged. .Pp If +.Fa c8 +is null, +.Nm +discards any pending incomplete UTF-8 code unit sequence in +.Fa ps , +outputs a (possibly empty) shift sequence to restore the initial state +followed by a null byte, and resets +.Fa ps +to the initial conversion state. +.Pp +If .Fa ps is a null pointer, .Nm @@ -128,10 +141,11 @@ on failure. .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" .Sh EXAMPLES Convert a UTF-8 code unit sequence to a multibyte string, -NUL-terminate it, and print it: +NUL-terminate it (with any shift sequence needed to restore the initial +state), and print it: .Bd -literal -offset indent char8_t c8[] = { 0xf0, 0x9f, 0x92, 0xa9 }; -char buf[__arraycount(c8)*MB_CUR_MAX + 1], *s = buf; +char buf[(__arraycount(c8) + 1)*MB_CUR_MAX], *s = buf; size_t i; mbstate_t mbs = {0}; /* initial conversion state */ @@ -144,7 +158,10 @@ for (i = 0; i < __arraycount(c8); i++) { assert(len < sizeof(buf) - (s - buf)); s += len; } -*s = '\e0'; /* NUL-terminate */ +len = c8rtomb(s, '\e0', &mbs); /* NUL-terminate */ +if (len == (size_t)-1) + err(1, "c16rtomb"); +assert(len <= sizeof(buf) - (s - buf)); printf("%s\en", buf); .Ed .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" @@ -197,26 +214,9 @@ The function first appeared in .Nx 11.0 . .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -.Sh BUGS -It is not clear from the standard how -.Nm -is supposed to behave when given an incomplete UTF-8 code unit sequence -followed by a NUL: -.Bd -literal -offset indent -c8rtomb(s, 0xf0, ps); -c8rtomb(s, 0x9f, ps); -c8rtomb(s, 0x92, ps); -c8rtomb(s, '\e0', ps); -.Ed -.Pp -Currently this fails with -.Er EILSEQ -which matches other implementations, but this is at odds with language -in the standard which suggests that passing -.Li '\e0' -should unconditionally store a null byte and reset -.Fa ps -to the initial conversion state: +.Sh CAVEATS +The standard requires that a null code unit unconditionally reset the +conversion state and output null: .Bd -filled -offset indent If .Fa c8 @@ -225,7 +225,8 @@ sequence needed to restore the initial s described is the initial conversion state. .Ed .Pp -However, it is unclear what else this should store besides a null -byte. -Should it discard the pending UTF-8 code unit sequence, or convert it -to something else and store that? +However, some implementations such as glibc 2.36 ignore this clause +and, if the null was preceded by an incomplete UTF-8 code unit +sequence, fail with +.Er EILSEQ +instead. Index: src/lib/libc/locale/c8rtomb.c diff -u src/lib/libc/locale/c8rtomb.c:1.4 src/lib/libc/locale/c8rtomb.c:1.5 --- src/lib/libc/locale/c8rtomb.c:1.4 Sat Aug 17 21:24:53 2024 +++ src/lib/libc/locale/c8rtomb.c Sun Aug 18 02:19:35 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: c8rtomb.c,v 1.4 2024/08/17 21:24:53 riastradh Exp $ */ +/* $NetBSD: c8rtomb.c,v 1.5 2024/08/18 02:19:35 riastradh Exp $ */ /*- * Copyright (c) 2024 The NetBSD Foundation, Inc. @@ -55,7 +55,7 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: c8rtomb.c,v 1.4 2024/08/17 21:24:53 riastradh Exp $"); +__RCSID("$NetBSD: c8rtomb.c,v 1.5 2024/08/18 02:19:35 riastradh Exp $"); #include "namespace.h" @@ -173,22 +173,21 @@ c8rtomb_l(char *restrict s, char8_t c8, */ S = (struct c8rtombstate *)(void *)ps; -#if 0 /* * `If c8 is a null character, a null byte is stored, preceded * by any shift sequence needed to restore the initial shift * state; the resulting state described is the initial * conversion state.' * - * XXX But what else gets stored? Do we just discard any - * pending sequence, or do we convert it to something else, or - * what? - */ - if (c8 == u8'\0') { - memset(S->buf, 0, sizeof(S->buf)); - S->n = 0; + * So if c8 is null, discard any buffered input -- there's + * nothing we can legitimately do with it -- and convert a null + * scalar value, which by definition of c32rtomb writes out any + * shift sequence reset followed by a null byte. + */ + if (c8 == '\0') { + c32 = 0; + goto accept; } -#endif /* * Get the current state and buffer. @@ -218,6 +217,7 @@ c8rtomb_l(char *restrict s, char8_t c8, __SHIFTIN(c32, __BITS(23,0))); return 0; case UTF8_ACCEPT: + accept: /* * We have a scalar value. Clear the state and output * the scalar value. Index: src/tests/lib/libc/locale/t_c16rtomb.c diff -u src/tests/lib/libc/locale/t_c16rtomb.c:1.2 src/tests/lib/libc/locale/t_c16rtomb.c:1.3 --- src/tests/lib/libc/locale/t_c16rtomb.c:1.2 Sat Aug 17 21:31:22 2024 +++ src/tests/lib/libc/locale/t_c16rtomb.c Sun Aug 18 02:19:35 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: t_c16rtomb.c,v 1.2 2024/08/17 21:31:22 riastradh Exp $ */ +/* $NetBSD: t_c16rtomb.c,v 1.3 2024/08/18 02:19:35 riastradh Exp $ */ /*- * Copyright (c) 2002 Tim J. Robbins @@ -33,7 +33,7 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: t_c16rtomb.c,v 1.2 2024/08/17 21:31:22 riastradh Exp $"); +__RCSID("$NetBSD: t_c16rtomb.c,v 1.3 2024/08/18 02:19:35 riastradh Exp $"); #include <errno.h> #include <limits.h> @@ -108,8 +108,6 @@ ATF_TC_BODY(c16rtomb_c_locale_test, tc) memset(&s, 0, sizeof(s)); memset(buf, 0xcc, sizeof(buf)); ATF_CHECK_EQ_MSG((n = c16rtomb(buf, 0xd83d, &s)), 0, "n=%zu", n); - atf_tc_expect_fail("PR lib/58615:" - " incomplete c8rtomb, c16rtomb handles NUL termination wrong"); ATF_CHECK_EQ_MSG((n = c16rtomb(buf, L'\0', &s)), 1, "n=%zu", n); ATF_CHECK_MSG(((unsigned char)buf[0] == '\0' && (unsigned char)buf[1] == 0xcc), @@ -189,8 +187,6 @@ ATF_TC_BODY(c16rtomb_utf_8_test, tc) memset(&s, 0, sizeof(s)); memset(buf, 0xcc, sizeof(buf)); ATF_CHECK_EQ_MSG((n = c16rtomb(buf, 0xd83d, &s)), 0, "n=%zu", n); - atf_tc_expect_fail("PR lib/58615:" - " incomplete c8rtomb, c16rtomb handles NUL termination wrong"); ATF_CHECK_EQ_MSG((n = c16rtomb(buf, L'\0', &s)), 1, "n=%zu", n); ATF_CHECK_MSG(((unsigned char)buf[0] == '\0' && Index: src/tests/lib/libc/locale/t_c8rtomb.c diff -u src/tests/lib/libc/locale/t_c8rtomb.c:1.2 src/tests/lib/libc/locale/t_c8rtomb.c:1.3 --- src/tests/lib/libc/locale/t_c8rtomb.c:1.2 Sat Aug 17 21:31:22 2024 +++ src/tests/lib/libc/locale/t_c8rtomb.c Sun Aug 18 02:19:35 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: t_c8rtomb.c,v 1.2 2024/08/17 21:31:22 riastradh Exp $ */ +/* $NetBSD: t_c8rtomb.c,v 1.3 2024/08/18 02:19:35 riastradh Exp $ */ /*- * Copyright (c) 2002 Tim J. Robbins @@ -33,7 +33,7 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: t_c8rtomb.c,v 1.2 2024/08/17 21:31:22 riastradh Exp $"); +__RCSID("$NetBSD: t_c8rtomb.c,v 1.3 2024/08/18 02:19:35 riastradh Exp $"); #include <errno.h> #include <limits.h> @@ -117,8 +117,6 @@ ATF_TC_BODY(c8rtomb_c_locale_test, tc) memset(&s, 0, sizeof(s)); memset(buf, 0xcc, sizeof(buf)); ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xf0, &s)), 0, "n=%zu", n); - atf_tc_expect_fail("PR lib/58615:" - " incomplete c8rtomb, c16rtomb handles NUL termination wrong"); ATF_CHECK_EQ_MSG((n = c8rtomb(buf, '\0', &s)), 1, "n=%zu", n); ATF_CHECK_MSG(((unsigned char)buf[0] == '\0' && (unsigned char)buf[1] == 0xcc), @@ -225,8 +223,6 @@ ATF_TC_BODY(c8rtomb_utf_8_test, tc) memset(&s, 0, sizeof(s)); memset(buf, 0xcc, sizeof(buf)); ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xf0, &s)), 0, "n=%zu", n); - atf_tc_expect_fail("PR lib/58615:" - " incomplete c8rtomb, c16rtomb handles NUL termination wrong"); ATF_CHECK_EQ_MSG((n = c8rtomb(buf, '\0', &s)), 1, "n=%zu", n); ATF_CHECK_MSG(((unsigned char)buf[0] == '\0' && (unsigned char)buf[1] == 0xcc),