Module Name: src
Committed By: riastradh
Date: Tue Aug 20 17:43:09 UTC 2024
Modified Files:
src/lib/libc/locale: mbrtoc32.c mbrtoc32.h
src/tests/lib/libc/locale: t_mbrtoc16.c t_mbrtoc8.c
Log Message:
mbrtoc32(3): Use conversion state to handle shift sequences.
PR lib/58618: mbrtocN(3) fails to keep shift state
To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 src/lib/libc/locale/mbrtoc32.c
cvs rdiff -u -r1.1 -r1.2 src/lib/libc/locale/mbrtoc32.h
cvs rdiff -u -r1.2 -r1.3 src/tests/lib/libc/locale/t_mbrtoc16.c \
src/tests/lib/libc/locale/t_mbrtoc8.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/lib/libc/locale/mbrtoc32.c
diff -u src/lib/libc/locale/mbrtoc32.c:1.7 src/lib/libc/locale/mbrtoc32.c:1.8
--- src/lib/libc/locale/mbrtoc32.c:1.7 Sun Aug 18 20:06:05 2024
+++ src/lib/libc/locale/mbrtoc32.c Tue Aug 20 17:43:09 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: mbrtoc32.c,v 1.7 2024/08/18 20:06:05 rillig Exp $ */
+/* $NetBSD: mbrtoc32.c,v 1.8 2024/08/20 17:43:09 riastradh Exp $ */
/*-
* Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -52,7 +52,7 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: mbrtoc32.c,v 1.7 2024/08/18 20:06:05 rillig Exp $");
+__RCSID("$NetBSD: mbrtoc32.c,v 1.8 2024/08/20 17:43:09 riastradh Exp $");
#include "namespace.h"
@@ -102,10 +102,17 @@ mbrtoc32_l(char32_t *restrict pc32, cons
mbstate_t *restrict ps, locale_t restrict loc)
{
static mbstate_t psbuf;
- struct mbrtoc32state *S;
struct _citrus_iconv *iconv = NULL;
- size_t len;
+ wchar_t wc;
+ mbstate_t wcrtombstate = {0};
+ char mb[MB_LEN_MAX];
+ int mblen;
+ char utf32le[MB_LEN_MAX];
+ const char *src;
+ char *dst;
+ size_t srcleft, dstleft, inval;
char32_t c32;
+ size_t len;
int error, errno_save;
/*
@@ -141,11 +148,6 @@ mbrtoc32_l(char32_t *restrict pc32, cons
}
/*
- * Get the private conversion state.
- */
- S = (struct mbrtoc32state *)(void *)ps;
-
- /*
* If input length is zero, the result is always incomplete by
* definition. Don't bother with iconv -- we'd have to
* disentangle truncated outputs.
@@ -156,12 +158,6 @@ mbrtoc32_l(char32_t *restrict pc32, cons
}
/*
- * Reset the destination buffer if this is the initial state.
- */
- if (S->dstleft == 0)
- S->dstleft = sizeof(S->dstbuf);
-
- /*
* Open an iconv handle to convert locale-dependent multibyte
* input to UTF-32LE.
*/
@@ -173,47 +169,55 @@ mbrtoc32_l(char32_t *restrict pc32, cons
}
/*
- * Try to iconv a minimal prefix. If we succeed, set len to
- * the length consumed and goto ok.
+ * Consume the next locale-dependent wide character. If no
+ * wide character can be obtained, stop here.
+ */
+ len = mbrtowc_l(&wc, s, n, ps, loc);
+ switch (len) {
+ case 0: /* NUL */
+ if (pc32)
+ *pc32 = 0;
+ goto out;
+ case (size_t)-2: /* still incomplete after n bytes */
+ case (size_t)-1: /* error */
+ goto out;
+ default: /* consumed len bytes of input */
+ break;
+ }
+
+ /*
+ * We consumed a wide character from the input. Convert it to
+ * a multibyte sequence _in the initial conversion state_, so
+ * we can pass that through iconv to get a Unicode scalar
+ * value.
*/
- for (len = 0; len < MIN(n, sizeof(S->srcbuf) - S->nsrc);) {
- const char *src = S->srcbuf;
- size_t srcleft;
- char *dst = S->dstbuf + sizeof(S->dstbuf) - S->dstleft;
- size_t inval;
-
- S->srcbuf[S->nsrc++] = s[len++];
- srcleft = S->nsrc;
-
- error = _citrus_iconv_convert(iconv,
- &src, &srcleft,
- &dst, &S->dstleft,
- _CITRUS_ICONV_F_HIDE_INVALID, &inval);
- if (error != EINVAL) {
- if (error == 0)
- break;
- errno = error;
- len = (size_t)-1;
- goto out;
- }
+ if ((mblen = wcrtomb_l(mb, wc, &wcrtombstate, loc)) == -1) {
+ len = (size_t)-1;
+ goto out;
}
/*
- * If it is still incomplete after trying the whole input
- * buffer, return (size_t)-2 and let the caller try again.
+ * Convert the multibyte sequence to UTF-32LE.
*/
+ src = mb;
+ srcleft = (size_t)mblen;
+ dst = utf32le;
+ dstleft = sizeof(utf32le);
+ error = _citrus_iconv_convert(iconv, &src, &srcleft, &dst, &dstleft,
+ _CITRUS_ICONV_F_HIDE_INVALID, &inval);
if (error) {
- len = (size_t)-2;
+ errno = error;
+ len = (size_t)-1;
goto out;
}
/*
- * Successfully converted a minimal byte sequence, which should
- * produce exactly one UTF-32 code unit, encoded in
- * little-endian, representing a code point. Get the code
+ * Successfully converted the multibyte sequence to UTF-32LE,
+ * which should produce exactly one UTF-32 code unit, encoded
+ * in little-endian, representing a code point. Get the code
* point.
*/
- c32 = le32dec(S->dstbuf);
+ c32 = le32dec(utf32le);
/*
* Reject surrogate code points. We only deal in scalar
@@ -245,11 +249,7 @@ mbrtoc32_l(char32_t *restrict pc32, cons
*/
errno = errno_save;
-out: if (len != (size_t)-2) {
- S->nsrc = 0;
- memset(S, 0, sizeof(*S)); /* paranoia */
- }
- errno_save = errno;
+out: errno_save = errno;
_citrus_iconv_close(iconv);
errno = errno_save;
return len;
Index: src/lib/libc/locale/mbrtoc32.h
diff -u src/lib/libc/locale/mbrtoc32.h:1.1 src/lib/libc/locale/mbrtoc32.h:1.2
--- src/lib/libc/locale/mbrtoc32.h:1.1 Thu Aug 15 14:16:33 2024
+++ src/lib/libc/locale/mbrtoc32.h Tue Aug 20 17:43:09 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: mbrtoc32.h,v 1.1 2024/08/15 14:16:33 riastradh Exp $ */
+/* $NetBSD: mbrtoc32.h,v 1.2 2024/08/20 17:43:09 riastradh Exp $ */
/*-
* Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -29,14 +29,12 @@
#ifndef LIB_LIBC_LOCALE_MBRTOC32_H_
#define LIB_LIBC_LOCALE_MBRTOC32_H_
-#include <limits.h>
-#include <uchar.h>
-
struct mbrtoc32state {
- char srcbuf[MB_LEN_MAX];
- size_t nsrc;
- char dstbuf[4];
- size_t dstleft;
+ /*
+ * XXX This needs to match the maximum size of any conversion
+ * state actually used by mbrtowc_l.
+ */
+ char dummy;
};
#endif /* LIB_LIBC_LOCALE_MBRTOC32_H_ */
Index: src/tests/lib/libc/locale/t_mbrtoc16.c
diff -u src/tests/lib/libc/locale/t_mbrtoc16.c:1.2 src/tests/lib/libc/locale/t_mbrtoc16.c:1.3
--- src/tests/lib/libc/locale/t_mbrtoc16.c:1.2 Mon Aug 19 16:24:05 2024
+++ src/tests/lib/libc/locale/t_mbrtoc16.c Tue Aug 20 17:43:09 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: t_mbrtoc16.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $ */
+/* $NetBSD: t_mbrtoc16.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $ */
/*-
* Copyright (c) 2002 Tim J. Robbins
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: t_mbrtoc16.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $");
+__RCSID("$NetBSD: t_mbrtoc16.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $");
#include <errno.h>
#include <inttypes.h>
@@ -171,22 +171,16 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te
/* Incomplete character sequence (shift sequence only). */
memset(&s, 0, sizeof(s));
c16 = 0;
- atf_tc_expect_fail("PR lib/58618:"
- " mbrtocN(3) fails to keep shift state");
ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(J", 3, &s)), (size_t)-2,
"n=%zu", n);
- atf_tc_expect_pass();
ATF_CHECK_EQ_MSG(c16, 0, "c16=U+%04"PRIx16, (uint16_t)c16);
/* Same as above, but complete (U+00A5 YEN SIGN). */
memset(&s, 0, sizeof(s));
c16 = 0;
- atf_tc_expect_fail("PR lib/58618:"
- " mbrtocN(3) fails to keep shift state");
ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(J\x5c", 4, &s)), 4,
"n=%zu", n);
ATF_CHECK_EQ_MSG(c16, 0xa5, "c16=U+%04"PRIx16, (uint16_t)c16);
- atf_tc_expect_pass();
/* Test restarting behaviour. */
memset(&s, 0, sizeof(s));
@@ -194,11 +188,8 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te
ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(", 2, &s)), (size_t)-2,
"n=%zu", n);
ATF_CHECK_EQ_MSG(c16, 0, "c16=U+%04"PRIx16, (uint16_t)c16);
- atf_tc_expect_fail("PR lib/58618:"
- " mbrtocN(3) fails to keep shift state");
ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "J\x5c", 2, &s)), 2, "n=%zu", n);
ATF_CHECK_EQ_MSG(c16, 0xa5, "c16=U+%04"PRIx16, (uint16_t)c16);
- atf_tc_expect_pass();
/*
* Test shift sequence state in various increments:
@@ -215,8 +206,6 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te
"n=%zu", n);
ATF_CHECK_EQ_MSG(c16, L'A', "c16=U+%04"PRIx16, (uint16_t)c16);
c16 = 0;
- atf_tc_expect_fail("PR lib/58618:"
- " mbrtocN(3) fails to keep shift state");
ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(J", 3, &s)), (size_t)-2,
"n=%zu", n);
ATF_CHECK_EQ_MSG(c16, 0, "c16=U+%04"PRIx16, (uint16_t)c16);
@@ -240,7 +229,6 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te
ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x22\x1b(B\x00", 5, &s)), 1,
"n=%zu", n);
ATF_CHECK_EQ_MSG(c16, 0x30a2, "c16=U+%04"PRIx16, (uint16_t)c16);
- atf_tc_expect_pass();
c16 = 0;
ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(", 2, &s)), (size_t)-2,
"n=%zu", n);
Index: src/tests/lib/libc/locale/t_mbrtoc8.c
diff -u src/tests/lib/libc/locale/t_mbrtoc8.c:1.2 src/tests/lib/libc/locale/t_mbrtoc8.c:1.3
--- src/tests/lib/libc/locale/t_mbrtoc8.c:1.2 Mon Aug 19 16:24:05 2024
+++ src/tests/lib/libc/locale/t_mbrtoc8.c Tue Aug 20 17:43:09 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: t_mbrtoc8.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $ */
+/* $NetBSD: t_mbrtoc8.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $ */
/*-
* Copyright (c) 2002 Tim J. Robbins
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: t_mbrtoc8.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $");
+__RCSID("$NetBSD: t_mbrtoc8.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $");
#include <errno.h>
#include <inttypes.h>
@@ -172,25 +172,19 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes
/* Incomplete character sequence (shift sequence only). */
memset(&s, 0, sizeof(s));
c8 = 0;
- atf_tc_expect_fail("PR lib/58618:"
- " mbrtocN(3) fails to keep shift state");
ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(J", 3, &s)), (size_t)-2,
"n=%zu", n);
- atf_tc_expect_pass();
ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8);
/* Same as above, but complete (U+00A5 YEN SIGN). */
memset(&s, 0, sizeof(s));
c8 = 0;
- atf_tc_expect_fail("PR lib/58618:"
- " mbrtocN(3) fails to keep shift state");
ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(J\x5c", 4, &s)), 4,
"n=%zu", n);
ATF_CHECK_EQ_MSG(c8, 0xc2, "c8=0x%"PRIx8, (uint8_t)c8);
ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
"n=%zu", n);
ATF_CHECK_EQ_MSG(c8, 0xa5, "c8=0x%"PRIx8, (uint8_t)c8);
- atf_tc_expect_pass();
/* Test restarting behaviour. */
memset(&s, 0, sizeof(s));
@@ -198,14 +192,11 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes
ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(", 2, &s)), (size_t)-2,
"n=%zu", n);
ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8);
- atf_tc_expect_fail("PR lib/58618:"
- " mbrtocN(3) fails to keep shift state");
ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "J\x5c", 2, &s)), 2, "n=%zu", n);
ATF_CHECK_EQ_MSG(c8, 0xc2, "c8=0x%"PRIx8, (uint8_t)c8);
ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
"n=%zu", n);
ATF_CHECK_EQ_MSG(c8, 0xa5, "c8=0x%"PRIx8, (uint8_t)c8);
- atf_tc_expect_pass();
/*
* Test shift sequence state in various increments:
@@ -221,8 +212,6 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes
ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "A\x1b(J", 4, &s)), 1, "n=%zu", n);
ATF_CHECK_EQ_MSG(c8, 'A', "c8=0x%"PRIx8, (uint8_t)c8);
c8 = 0;
- atf_tc_expect_fail("PR lib/58618:"
- " mbrtocn(3) fails to keep shift state");
ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(J", 3, &s)), (size_t)-2,
"n=%zu", n);
ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8);
@@ -264,7 +253,6 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes
ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(", 2, &s)), (size_t)-3,
"n=%zu", n);
ATF_CHECK_EQ_MSG(c8, 0xa2, "c8=0x%"PRIx8, (uint8_t)c8);
- atf_tc_expect_pass();
c8 = 0;
ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(", 2, &s)), (size_t)-2,
"n=%zu", n);