Module Name: src Committed By: riastradh Date: Mon Aug 19 16:22:10 UTC 2024
Modified Files: src/lib/libc/locale: c32rtomb.c c32rtomb.h src/tests/lib/libc/locale: t_c16rtomb.c t_c8rtomb.c Log Message: c32rtomb(3): Use conversion state to handle shift sequences. For conversion of Unicode scalar values to coding systems requiring shift sequences, such as ISO-2022-JP, _citrus_iconv_convert will always produce: 1. a shift sequence from the initial state to some nondefault state, like from US-ASCII to JIS X 0208 2. the encoding of the desired characater 3. a shift sequence restoring the initial state This is unnecessary if the output is already in the state needed to encoded the desired character. For example, this method produces seven bytes to encode each YEN SIGN in ISO-2022-JP -- and fourteen, to encode two consecutive ones -- even though the shift sequence is only three bytes long and once shifted YEN SIGN takes only one byte. Instead, convert the Unicode scalar value to a locale-dependent wide character and encode that, by composing - _citrus_iconv_convert => gives us a multibyte encoding of the character from the initial state (and restoring the initial state afterward) - mbrtowc with initial conversion state => gives us the single wide character representation XXX If combining characters are possible here, this may fail. - wcrtomb with caller's conversion tsate => gives us a state-dependent multibyte encoding of the character XXX Is there a cheaper way to convert from Unicode scalar value to locale-dependent wide character? It is not obvious to me from the largely undocumented Citrus machinery, but it would obviously be better than this somewhat circuitous Rube Goldberg contraption of chained multibyte APIs. PR lib/58612: c8rtomb/c16rtomb/c32rtomb yield suboptimal shift sequences To generate a diff of this commit: cvs rdiff -u -r1.3 -r1.4 src/lib/libc/locale/c32rtomb.c cvs rdiff -u -r1.1 -r1.2 src/lib/libc/locale/c32rtomb.h cvs rdiff -u -r1.5 -r1.6 src/tests/lib/libc/locale/t_c16rtomb.c cvs rdiff -u -r1.6 -r1.7 src/tests/lib/libc/locale/t_c8rtomb.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/lib/libc/locale/c32rtomb.c diff -u src/lib/libc/locale/c32rtomb.c:1.3 src/lib/libc/locale/c32rtomb.c:1.4 --- src/lib/libc/locale/c32rtomb.c:1.3 Sat Aug 17 21:24:53 2024 +++ src/lib/libc/locale/c32rtomb.c Mon Aug 19 16:22:10 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: c32rtomb.c,v 1.3 2024/08/17 21:24:53 riastradh Exp $ */ +/* $NetBSD: c32rtomb.c,v 1.4 2024/08/19 16:22:10 riastradh Exp $ */ /*- * Copyright (c) 2024 The NetBSD Foundation, Inc. @@ -49,7 +49,7 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: c32rtomb.c,v 1.3 2024/08/17 21:24:53 riastradh Exp $"); +__RCSID("$NetBSD: c32rtomb.c,v 1.4 2024/08/19 16:22:10 riastradh Exp $"); #include "namespace.h" @@ -89,12 +89,17 @@ size_t c32rtomb_l(char *restrict s, char32_t c32, mbstate_t *restrict ps, locale_t loc) { - char buf[MB_LEN_MAX]; + static mbstate_t psbuf; struct _citrus_iconv *iconv = NULL; - char srcbuf[4]; + char buf[2*MB_LEN_MAX]; /* [shift from init, wc] [shift to init] */ + char utf32le[4]; const char *src; char *dst; - size_t srcleft, dstleft, inval, len; + size_t srcleft, dstleft, inval; + mbstate_t mbrtowcstate = {0}; + wchar_t wc; + int wlen; + size_t len; int error, errno_save; /* @@ -103,6 +108,18 @@ c32rtomb_l(char *restrict s, char32_t c3 errno_save = errno; /* + * `If ps is a null pointer, each function uses its own + * internal mbstate_t object instead, which is initialized at + * program startup to the initial conversion state; the + * functions are not required to avoid data races with other + * calls to the same function in this case. The + * implementation behaves as if no library function calls + * these functions with a null pointer for ps.' + */ + if (ps == NULL) + ps = &psbuf; + + /* * `If s is a null pointer, the c32rtomb function is equivalent * to the call * @@ -116,7 +133,10 @@ c32rtomb_l(char *restrict s, char32_t c3 } /* - * Reject surrogates. + * Reject surrogate code points. We only deal in scalar + * values. + * + * XXX Is this necessary? Won't iconv take care of it for us? */ if (c32 >= 0xd800 && c32 <= 0xdfff) { errno = EILSEQ; @@ -136,16 +156,14 @@ c32rtomb_l(char *restrict s, char32_t c3 } /* - * Convert from UTF-32LE in our buffer. + * Convert from UTF-32LE to a multibyte sequence. */ - le32enc(srcbuf, c32); - src = srcbuf; - srcleft = sizeof(srcbuf); - dst = s; + le32enc(utf32le, c32); + src = utf32le; + srcleft = sizeof(utf32le); + dst = buf; dstleft = MB_CUR_MAX; - error = _citrus_iconv_convert(iconv, - &src, &srcleft, - &dst, &dstleft, + error = _citrus_iconv_convert(iconv, &src, &srcleft, &dst, &dstleft, _CITRUS_ICONV_F_HIDE_INVALID, &inval); if (error) { /* can't be incomplete, must be error */ errno = error; @@ -168,6 +186,36 @@ c32rtomb_l(char *restrict s, char32_t c3 } /* + * Now get a wide character out of the buffer. We don't care + * how much it consumes other than for a diagnostic assertion. + * It had better return exactly one wide character, because we + * are only allowed to encode one wide character's worth of + * multibyte output (possibly including a shift sequence). + * + * XXX What about combining characters? + */ + wlen = mbrtowc_l(&wc, buf, len, &mbrtowcstate, loc); + switch (wlen) { + case -1: /* error, with errno set */ + goto out; + case 0: /* decoded NUL */ + wc = 0; /* paranoia */ + break; + default: /* decoded wc */ + _DIAGASSERT(wlen > 0); + _DIAGASSERT((size_t)wlen <= len); + } + + /* + * Now put the wide character out, using the caller's + * conversion state so that we don't output unnecessary shift + * sequences. + */ + len = wcrtomb_l(s, wc, ps, loc); + if (len == (size_t)-1) /* error, with errno set */ + goto out; + + /* * Make sure we preserve errno on success. */ errno = errno_save; Index: src/lib/libc/locale/c32rtomb.h diff -u src/lib/libc/locale/c32rtomb.h:1.1 src/lib/libc/locale/c32rtomb.h:1.2 --- src/lib/libc/locale/c32rtomb.h:1.1 Thu Aug 15 14:16:33 2024 +++ src/lib/libc/locale/c32rtomb.h Mon Aug 19 16:22:10 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: c32rtomb.h,v 1.1 2024/08/15 14:16:33 riastradh Exp $ */ +/* $NetBSD: c32rtomb.h,v 1.2 2024/08/19 16:22:10 riastradh Exp $ */ /*- * Copyright (c) 2024 The NetBSD Foundation, Inc. @@ -30,6 +30,10 @@ #define LIB_LIBC_LOCALE_C32RTOMB_H_ struct c32rtombstate { + /* + * XXX This needs to match the maximum size of any conversion + * state actually used by wcrtomb_l. + */ char dummy; }; Index: src/tests/lib/libc/locale/t_c16rtomb.c diff -u src/tests/lib/libc/locale/t_c16rtomb.c:1.5 src/tests/lib/libc/locale/t_c16rtomb.c:1.6 --- src/tests/lib/libc/locale/t_c16rtomb.c:1.5 Mon Aug 19 16:21:47 2024 +++ src/tests/lib/libc/locale/t_c16rtomb.c Mon Aug 19 16:22:10 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: t_c16rtomb.c,v 1.5 2024/08/19 16:21:47 riastradh Exp $ */ +/* $NetBSD: t_c16rtomb.c,v 1.6 2024/08/19 16:22:10 riastradh Exp $ */ /*- * Copyright (c) 2002 Tim J. Robbins @@ -33,7 +33,7 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: t_c16rtomb.c,v 1.5 2024/08/19 16:21:47 riastradh Exp $"); +__RCSID("$NetBSD: t_c16rtomb.c,v 1.6 2024/08/19 16:22:10 riastradh Exp $"); #include <errno.h> #include <limits.h> @@ -155,8 +155,6 @@ ATF_TC_BODY(c16rtomb_iso2022jp_locale_te p = buf; ATF_CHECK_EQ_MSG((n = c16rtomb(p, L'A', &s)), 1, "n=%zu", n); /* 1 */ p += 1; - atf_tc_expect_fail("PR lib/58612:" - " c8rtomb/c16rtomb/c32rtomb yield suboptimal shift sequences"); ATF_CHECK_EQ_MSG((n = c16rtomb(p, 0xa5, &s)), 4, "n=%zu", n); /* 2 */ p += 4; ATF_CHECK_EQ_MSG((n = c16rtomb(p, 0xa5, &s)), 1, "n=%zu", n); /* 3 */ Index: src/tests/lib/libc/locale/t_c8rtomb.c diff -u src/tests/lib/libc/locale/t_c8rtomb.c:1.6 src/tests/lib/libc/locale/t_c8rtomb.c:1.7 --- src/tests/lib/libc/locale/t_c8rtomb.c:1.6 Mon Aug 19 16:21:47 2024 +++ src/tests/lib/libc/locale/t_c8rtomb.c Mon Aug 19 16:22:10 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: t_c8rtomb.c,v 1.6 2024/08/19 16:21:47 riastradh Exp $ */ +/* $NetBSD: t_c8rtomb.c,v 1.7 2024/08/19 16:22:10 riastradh Exp $ */ /*- * Copyright (c) 2002 Tim J. Robbins @@ -33,7 +33,7 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: t_c8rtomb.c,v 1.6 2024/08/19 16:21:47 riastradh Exp $"); +__RCSID("$NetBSD: t_c8rtomb.c,v 1.7 2024/08/19 16:22:10 riastradh Exp $"); #include <errno.h> #include <limits.h> @@ -191,8 +191,6 @@ ATF_TC_BODY(c8rtomb_iso2022jp_locale_tes ATF_CHECK_EQ_MSG((n = c8rtomb(p, 'A', &s)), 1, "n=%zu", n); /* 1 */ p += 1; ATF_CHECK_EQ_MSG((n = c8rtomb(p, 0xc2, &s)), 0, "n=%zu", n); /* 2 */ - atf_tc_expect_fail("PR lib/58612:" - " c8rtomb/c16rtomb/c32rtomb yield suboptimal shift sequences"); ATF_CHECK_EQ_MSG((n = c8rtomb(p, 0xa5, &s)), 4, "n=%zu", n); p += 4; ATF_CHECK_EQ_MSG((n = c8rtomb(p, 0xc2, &s)), 0, "n=%zu", n); /* 3 */