Module Name:    src
Committed By:   riastradh
Date:           Mon Aug 19 16:22:10 UTC 2024

Modified Files:
        src/lib/libc/locale: c32rtomb.c c32rtomb.h
        src/tests/lib/libc/locale: t_c16rtomb.c t_c8rtomb.c

Log Message:
c32rtomb(3): Use conversion state to handle shift sequences.

For conversion of Unicode scalar values to coding systems requiring
shift sequences, such as ISO-2022-JP, _citrus_iconv_convert will
always produce:

1. a shift sequence from the initial state to some nondefault state,
   like from US-ASCII to JIS X 0208
2. the encoding of the desired characater
3. a shift sequence restoring the initial state

This is unnecessary if the output is already in the state needed to
encoded the desired character.  For example, this method produces
seven bytes to encode each YEN SIGN in ISO-2022-JP -- and fourteen,
to encode two consecutive ones -- even though the shift sequence is
only three bytes long and once shifted YEN SIGN takes only one byte.

Instead, convert the Unicode scalar value to a locale-dependent wide
character and encode that, by composing

- _citrus_iconv_convert
  => gives us a multibyte encoding of the character from the initial
     state (and restoring the initial state afterward)
- mbrtowc with initial conversion state
  => gives us the single wide character representation
     XXX If combining characters are possible here, this may fail.
- wcrtomb with caller's conversion tsate
  => gives us a state-dependent multibyte encoding of the character

XXX Is there a cheaper way to convert from Unicode scalar value to
locale-dependent wide character?  It is not obvious to me from the
largely undocumented Citrus machinery, but it would obviously be
better than this somewhat circuitous Rube Goldberg contraption of
chained multibyte APIs.

PR lib/58612: c8rtomb/c16rtomb/c32rtomb yield suboptimal shift
sequences


To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/lib/libc/locale/c32rtomb.c
cvs rdiff -u -r1.1 -r1.2 src/lib/libc/locale/c32rtomb.h
cvs rdiff -u -r1.5 -r1.6 src/tests/lib/libc/locale/t_c16rtomb.c
cvs rdiff -u -r1.6 -r1.7 src/tests/lib/libc/locale/t_c8rtomb.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/lib/libc/locale/c32rtomb.c
diff -u src/lib/libc/locale/c32rtomb.c:1.3 src/lib/libc/locale/c32rtomb.c:1.4
--- src/lib/libc/locale/c32rtomb.c:1.3	Sat Aug 17 21:24:53 2024
+++ src/lib/libc/locale/c32rtomb.c	Mon Aug 19 16:22:10 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: c32rtomb.c,v 1.3 2024/08/17 21:24:53 riastradh Exp $	*/
+/*	$NetBSD: c32rtomb.c,v 1.4 2024/08/19 16:22:10 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -49,7 +49,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: c32rtomb.c,v 1.3 2024/08/17 21:24:53 riastradh Exp $");
+__RCSID("$NetBSD: c32rtomb.c,v 1.4 2024/08/19 16:22:10 riastradh Exp $");
 
 #include "namespace.h"
 
@@ -89,12 +89,17 @@ size_t
 c32rtomb_l(char *restrict s, char32_t c32, mbstate_t *restrict ps,
     locale_t loc)
 {
-	char buf[MB_LEN_MAX];
+	static mbstate_t psbuf;
 	struct _citrus_iconv *iconv = NULL;
-	char srcbuf[4];
+	char buf[2*MB_LEN_MAX];	/* [shift from init, wc] [shift to init] */
+	char utf32le[4];
 	const char *src;
 	char *dst;
-	size_t srcleft, dstleft, inval, len;
+	size_t srcleft, dstleft, inval;
+	mbstate_t mbrtowcstate = {0};
+	wchar_t wc;
+	int wlen;
+	size_t len;
 	int error, errno_save;
 
 	/*
@@ -103,6 +108,18 @@ c32rtomb_l(char *restrict s, char32_t c3
 	errno_save = errno;
 
 	/*
+	 * `If ps is a null pointer, each function uses its own
+	 *  internal mbstate_t object instead, which is initialized at
+	 *  program startup to the initial conversion state; the
+	 *  functions are not required to avoid data races with other
+	 *  calls to the same function in this case.  The
+	 *  implementation behaves as if no library function calls
+	 *  these functions with a null pointer for ps.'
+	 */
+	if (ps == NULL)
+		ps = &psbuf;
+
+	/*
 	 * `If s is a null pointer, the c32rtomb function is equivalent
 	 *  to the call
 	 *
@@ -116,7 +133,10 @@ c32rtomb_l(char *restrict s, char32_t c3
 	}
 
 	/*
-	 * Reject surrogates.
+	 * Reject surrogate code points.  We only deal in scalar
+	 * values.
+	 *
+	 * XXX Is this necessary?  Won't iconv take care of it for us?
 	 */
 	if (c32 >= 0xd800 && c32 <= 0xdfff) {
 		errno = EILSEQ;
@@ -136,16 +156,14 @@ c32rtomb_l(char *restrict s, char32_t c3
 	}
 
 	/*
-	 * Convert from UTF-32LE in our buffer.
+	 * Convert from UTF-32LE to a multibyte sequence.
 	 */
-	le32enc(srcbuf, c32);
-	src = srcbuf;
-	srcleft = sizeof(srcbuf);
-	dst = s;
+	le32enc(utf32le, c32);
+	src = utf32le;
+	srcleft = sizeof(utf32le);
+	dst = buf;
 	dstleft = MB_CUR_MAX;
-	error = _citrus_iconv_convert(iconv,
-	    &src, &srcleft,
-	    &dst, &dstleft,
+	error = _citrus_iconv_convert(iconv, &src, &srcleft, &dst, &dstleft,
 	    _CITRUS_ICONV_F_HIDE_INVALID, &inval);
 	if (error) {		/* can't be incomplete, must be error */
 		errno = error;
@@ -168,6 +186,36 @@ c32rtomb_l(char *restrict s, char32_t c3
 	}
 
 	/*
+	 * Now get a wide character out of the buffer.  We don't care
+	 * how much it consumes other than for a diagnostic assertion.
+	 * It had better return exactly one wide character, because we
+	 * are only allowed to encode one wide character's worth of
+	 * multibyte output (possibly including a shift sequence).
+	 *
+	 * XXX What about combining characters?
+	 */
+	wlen = mbrtowc_l(&wc, buf, len, &mbrtowcstate, loc);
+	switch (wlen) {
+	case -1:		/* error, with errno set */
+		goto out;
+	case 0:			/* decoded NUL */
+		wc = 0;		/* paranoia */
+		break;
+	default:		/* decoded wc */
+		_DIAGASSERT(wlen > 0);
+		_DIAGASSERT((size_t)wlen <= len);
+	}
+
+	/*
+	 * Now put the wide character out, using the caller's
+	 * conversion state so that we don't output unnecessary shift
+	 * sequences.
+	 */
+	len = wcrtomb_l(s, wc, ps, loc);
+	if (len == (size_t)-1)	/* error, with errno set */
+		goto out;
+
+	/*
 	 * Make sure we preserve errno on success.
 	 */
 	errno = errno_save;

Index: src/lib/libc/locale/c32rtomb.h
diff -u src/lib/libc/locale/c32rtomb.h:1.1 src/lib/libc/locale/c32rtomb.h:1.2
--- src/lib/libc/locale/c32rtomb.h:1.1	Thu Aug 15 14:16:33 2024
+++ src/lib/libc/locale/c32rtomb.h	Mon Aug 19 16:22:10 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: c32rtomb.h,v 1.1 2024/08/15 14:16:33 riastradh Exp $	*/
+/*	$NetBSD: c32rtomb.h,v 1.2 2024/08/19 16:22:10 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -30,6 +30,10 @@
 #define	LIB_LIBC_LOCALE_C32RTOMB_H_
 
 struct c32rtombstate {
+	/*
+	 * XXX This needs to match the maximum size of any conversion
+	 * state actually used by wcrtomb_l.
+	 */
 	char		dummy;
 };
 

Index: src/tests/lib/libc/locale/t_c16rtomb.c
diff -u src/tests/lib/libc/locale/t_c16rtomb.c:1.5 src/tests/lib/libc/locale/t_c16rtomb.c:1.6
--- src/tests/lib/libc/locale/t_c16rtomb.c:1.5	Mon Aug 19 16:21:47 2024
+++ src/tests/lib/libc/locale/t_c16rtomb.c	Mon Aug 19 16:22:10 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: t_c16rtomb.c,v 1.5 2024/08/19 16:21:47 riastradh Exp $	*/
+/*	$NetBSD: t_c16rtomb.c,v 1.6 2024/08/19 16:22:10 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2002 Tim J. Robbins
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: t_c16rtomb.c,v 1.5 2024/08/19 16:21:47 riastradh Exp $");
+__RCSID("$NetBSD: t_c16rtomb.c,v 1.6 2024/08/19 16:22:10 riastradh Exp $");
 
 #include <errno.h>
 #include <limits.h>
@@ -155,8 +155,6 @@ ATF_TC_BODY(c16rtomb_iso2022jp_locale_te
 	p = buf;
 	ATF_CHECK_EQ_MSG((n = c16rtomb(p, L'A', &s)), 1, "n=%zu", n); /* 1 */
 	p += 1;
-	atf_tc_expect_fail("PR lib/58612:"
-	    " c8rtomb/c16rtomb/c32rtomb yield suboptimal shift sequences");
 	ATF_CHECK_EQ_MSG((n = c16rtomb(p, 0xa5, &s)), 4, "n=%zu", n); /* 2 */
 	p += 4;
 	ATF_CHECK_EQ_MSG((n = c16rtomb(p, 0xa5, &s)), 1, "n=%zu", n); /* 3 */

Index: src/tests/lib/libc/locale/t_c8rtomb.c
diff -u src/tests/lib/libc/locale/t_c8rtomb.c:1.6 src/tests/lib/libc/locale/t_c8rtomb.c:1.7
--- src/tests/lib/libc/locale/t_c8rtomb.c:1.6	Mon Aug 19 16:21:47 2024
+++ src/tests/lib/libc/locale/t_c8rtomb.c	Mon Aug 19 16:22:10 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: t_c8rtomb.c,v 1.6 2024/08/19 16:21:47 riastradh Exp $	*/
+/*	$NetBSD: t_c8rtomb.c,v 1.7 2024/08/19 16:22:10 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2002 Tim J. Robbins
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: t_c8rtomb.c,v 1.6 2024/08/19 16:21:47 riastradh Exp $");
+__RCSID("$NetBSD: t_c8rtomb.c,v 1.7 2024/08/19 16:22:10 riastradh Exp $");
 
 #include <errno.h>
 #include <limits.h>
@@ -191,8 +191,6 @@ ATF_TC_BODY(c8rtomb_iso2022jp_locale_tes
 	ATF_CHECK_EQ_MSG((n = c8rtomb(p, 'A', &s)), 1, "n=%zu", n); /* 1 */
 	p += 1;
 	ATF_CHECK_EQ_MSG((n = c8rtomb(p, 0xc2, &s)), 0, "n=%zu", n); /* 2 */
-	atf_tc_expect_fail("PR lib/58612:"
-	    " c8rtomb/c16rtomb/c32rtomb yield suboptimal shift sequences");
 	ATF_CHECK_EQ_MSG((n = c8rtomb(p, 0xa5, &s)), 4, "n=%zu", n);
 	p += 4;
 	ATF_CHECK_EQ_MSG((n = c8rtomb(p, 0xc2, &s)), 0, "n=%zu", n); /* 3 */

Reply via email to