Module Name: src Committed By: riastradh Date: Thu Aug 15 21:19:46 UTC 2024
Modified Files: src/distrib/sets/lists/comp: mi src/distrib/sets/lists/debug: mi src/distrib/sets/lists/tests: mi src/include: uchar.h src/lib/libc/locale: Makefile.inc src/share/man/man3: uchar.3 src/tests/lib/libc/locale: Makefile Added Files: src/lib/libc/locale: c8rtomb.3 c8rtomb.c mbrtoc8.3 mbrtoc8.c src/tests/lib/libc/locale: t_c8rtomb.c t_mbrtoc8.c Log Message: libc: New functions c8rtomb(3) and mbrtoc8(3). New in C23, for converting from UTF-8 to locale-dependent multibyte sequences (c8rtomb) or vice versa (mbrtoc8), along with the new type char8_t. Conditional on either: - _NETBSD_SOURCE - _ISOC23_SOURCE - __STDC_VERSION__ >= 202311L (Riding the libc minor bump from this morning for the UTF-16/UTF-32 versions from C11.) PR standards/58601: uchar.h C23 compliance: char8_t, mbrtoc8, c8rtomb To generate a diff of this commit: cvs rdiff -u -r1.2469 -r1.2470 src/distrib/sets/lists/comp/mi cvs rdiff -u -r1.443 -r1.444 src/distrib/sets/lists/debug/mi cvs rdiff -u -r1.1331 -r1.1332 src/distrib/sets/lists/tests/mi cvs rdiff -u -r1.1 -r1.2 src/include/uchar.h cvs rdiff -u -r1.66 -r1.67 src/lib/libc/locale/Makefile.inc cvs rdiff -u -r0 -r1.1 src/lib/libc/locale/c8rtomb.3 \ src/lib/libc/locale/c8rtomb.c src/lib/libc/locale/mbrtoc8.3 \ src/lib/libc/locale/mbrtoc8.c cvs rdiff -u -r1.1 -r1.2 src/share/man/man3/uchar.3 cvs rdiff -u -r1.17 -r1.18 src/tests/lib/libc/locale/Makefile cvs rdiff -u -r0 -r1.1 src/tests/lib/libc/locale/t_c8rtomb.c \ src/tests/lib/libc/locale/t_mbrtoc8.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/distrib/sets/lists/comp/mi diff -u src/distrib/sets/lists/comp/mi:1.2469 src/distrib/sets/lists/comp/mi:1.2470 --- src/distrib/sets/lists/comp/mi:1.2469 Thu Aug 15 14:16:32 2024 +++ src/distrib/sets/lists/comp/mi Thu Aug 15 21:19:44 2024 @@ -1,4 +1,4 @@ -# $NetBSD: mi,v 1.2469 2024/08/15 14:16:32 riastradh Exp $ +# $NetBSD: mi,v 1.2470 2024/08/15 21:19:44 riastradh Exp $ # # Note: don't delete entries from here - mark them as "obsolete" instead. ./etc/mtree/set.comp comp-sys-root @@ -6844,6 +6844,7 @@ ./usr/share/man/cat3/bzero.0 comp-c-catman .cat ./usr/share/man/cat3/c16rtomb.0 comp-c-catman .cat ./usr/share/man/cat3/c32rtomb.0 comp-c-catman .cat +./usr/share/man/cat3/c8rtomb.0 comp-c-catman .cat ./usr/share/man/cat3/cabs.0 comp-c-catman complex,.cat ./usr/share/man/cat3/cabsf.0 comp-c-catman complex,.cat ./usr/share/man/cat3/cabsl.0 comp-c-catman complex,.cat @@ -9208,6 +9209,7 @@ ./usr/share/man/cat3/mbrlen.0 comp-c-catman .cat ./usr/share/man/cat3/mbrtoc16.0 comp-c-catman .cat ./usr/share/man/cat3/mbrtoc32.0 comp-c-catman .cat +./usr/share/man/cat3/mbrtoc8.0 comp-c-catman .cat ./usr/share/man/cat3/mbrtowc.0 comp-c-catman .cat ./usr/share/man/cat3/mbsinit.0 comp-c-catman .cat ./usr/share/man/cat3/mbsrtowcs.0 comp-c-catman .cat @@ -15409,6 +15411,7 @@ ./usr/share/man/html3/bzero.html comp-c-htmlman html ./usr/share/man/html3/c16rtomb.html comp-c-htmlman html ./usr/share/man/html3/c32rtomb.html comp-c-htmlman html +./usr/share/man/html3/c8rtomb.html comp-c-htmlman html ./usr/share/man/html3/cabs.html comp-c-htmlman complex,html ./usr/share/man/html3/cabsf.html comp-c-htmlman complex,html ./usr/share/man/html3/cabsl.html comp-c-htmlman complex,html @@ -17705,6 +17708,7 @@ ./usr/share/man/html3/mbrlen.html comp-c-htmlman html ./usr/share/man/html3/mbrtoc16.html comp-c-htmlman html ./usr/share/man/html3/mbrtoc32.html comp-c-htmlman html +./usr/share/man/html3/mbrtoc8.html comp-c-htmlman html ./usr/share/man/html3/mbrtowc.html 
comp-c-htmlman html ./usr/share/man/html3/mbsinit.html comp-c-htmlman html ./usr/share/man/html3/mbsrtowcs.html comp-c-htmlman html @@ -23833,6 +23837,7 @@ ./usr/share/man/man3/bzero.3 comp-c-man .man ./usr/share/man/man3/c16rtomb.3 comp-c-man .man ./usr/share/man/man3/c32rtomb.3 comp-c-man .man +./usr/share/man/man3/c8rtomb.3 comp-c-man .man ./usr/share/man/man3/cabs.3 comp-c-man complex,.man ./usr/share/man/man3/cabsf.3 comp-c-man complex,.man ./usr/share/man/man3/cabsl.3 comp-c-man complex,.man @@ -26209,6 +26214,7 @@ ./usr/share/man/man3/mbrlen.3 comp-c-man .man ./usr/share/man/man3/mbrtoc16.3 comp-c-man .man ./usr/share/man/man3/mbrtoc32.3 comp-c-man .man +./usr/share/man/man3/mbrtoc8.3 comp-c-man .man ./usr/share/man/man3/mbrtowc.3 comp-c-man .man ./usr/share/man/man3/mbsinit.3 comp-c-man .man ./usr/share/man/man3/mbsrtowcs.3 comp-c-man .man Index: src/distrib/sets/lists/debug/mi diff -u src/distrib/sets/lists/debug/mi:1.443 src/distrib/sets/lists/debug/mi:1.444 --- src/distrib/sets/lists/debug/mi:1.443 Thu Aug 15 14:16:32 2024 +++ src/distrib/sets/lists/debug/mi Thu Aug 15 21:19:44 2024 @@ -1,4 +1,4 @@ -# $NetBSD: mi,v 1.443 2024/08/15 14:16:32 riastradh Exp $ +# $NetBSD: mi,v 1.444 2024/08/15 21:19:44 riastradh Exp $ # ./etc/mtree/set.debug comp-sys-root ./usr/lib comp-sys-usr compatdir @@ -2060,6 +2060,7 @@ ./usr/libdata/debug/usr/tests/lib/libc/locale/t_btowc.debug tests-lib-debug debug,atf,compattestfile ./usr/libdata/debug/usr/tests/lib/libc/locale/t_c16rtomb.debug tests-lib-debug debug,atf,compattestfile ./usr/libdata/debug/usr/tests/lib/libc/locale/t_c32rtomb.debug tests-lib-debug debug,atf,compattestfile +./usr/libdata/debug/usr/tests/lib/libc/locale/t_c8rtomb.debug tests-lib-debug debug,atf,compattestfile ./usr/libdata/debug/usr/tests/lib/libc/locale/t_ctype1.debug tests-obsolete obsolete,compattestfile ./usr/libdata/debug/usr/tests/lib/libc/locale/t_ctype2.debug tests-obsolete obsolete,compattestfile 
./usr/libdata/debug/usr/tests/lib/libc/locale/t_digittoint.debug tests-lib-debug debug,atf,compattestfile @@ -2067,6 +2068,7 @@ ./usr/libdata/debug/usr/tests/lib/libc/locale/t_io.debug tests-lib-debug debug,atf,compattestfile ./usr/libdata/debug/usr/tests/lib/libc/locale/t_mbrtoc16.debug tests-lib-debug debug,atf,compattestfile ./usr/libdata/debug/usr/tests/lib/libc/locale/t_mbrtoc32.debug tests-lib-debug debug,atf,compattestfile +./usr/libdata/debug/usr/tests/lib/libc/locale/t_mbrtoc8.debug tests-lib-debug debug,atf,compattestfile ./usr/libdata/debug/usr/tests/lib/libc/locale/t_mbrtowc.debug tests-lib-debug debug,atf,compattestfile ./usr/libdata/debug/usr/tests/lib/libc/locale/t_mbsnrtowcs.debug tests-lib-debug debug,atf,compattestfile ./usr/libdata/debug/usr/tests/lib/libc/locale/t_mbstowcs.debug tests-lib-debug debug,atf,compattestfile Index: src/distrib/sets/lists/tests/mi diff -u src/distrib/sets/lists/tests/mi:1.1331 src/distrib/sets/lists/tests/mi:1.1332 --- src/distrib/sets/lists/tests/mi:1.1331 Thu Aug 15 14:16:33 2024 +++ src/distrib/sets/lists/tests/mi Thu Aug 15 21:19:45 2024 @@ -1,4 +1,4 @@ -# $NetBSD: mi,v 1.1331 2024/08/15 14:16:33 riastradh Exp $ +# $NetBSD: mi,v 1.1332 2024/08/15 21:19:45 riastradh Exp $ # # Note: don't delete entries from here - mark them as "obsolete" instead. 
# @@ -3075,6 +3075,7 @@ ./usr/tests/lib/libc/locale/t_btowc tests-lib-tests compattestfile,atf ./usr/tests/lib/libc/locale/t_c16rtomb tests-lib-tests compattestfile,atf ./usr/tests/lib/libc/locale/t_c32rtomb tests-lib-tests compattestfile,atf +./usr/tests/lib/libc/locale/t_c8rtomb tests-lib-tests compattestfile,atf ./usr/tests/lib/libc/locale/t_ctype1 tests-obsolete obsolete ./usr/tests/lib/libc/locale/t_ctype2 tests-obsolete obsolete ./usr/tests/lib/libc/locale/t_digittoint tests-lib-tests compattestfile,atf @@ -3082,6 +3083,7 @@ ./usr/tests/lib/libc/locale/t_io tests-lib-tests compattestfile,atf ./usr/tests/lib/libc/locale/t_mbrtoc16 tests-lib-tests compattestfile,atf ./usr/tests/lib/libc/locale/t_mbrtoc32 tests-lib-tests compattestfile,atf +./usr/tests/lib/libc/locale/t_mbrtoc8 tests-lib-tests compattestfile,atf ./usr/tests/lib/libc/locale/t_mbrtowc tests-lib-tests compattestfile,atf ./usr/tests/lib/libc/locale/t_mbsnrtowcs tests-lib-tests compattestfile,atf ./usr/tests/lib/libc/locale/t_mbstowcs tests-lib-tests compattestfile,atf Index: src/include/uchar.h diff -u src/include/uchar.h:1.1 src/include/uchar.h:1.2 --- src/include/uchar.h:1.1 Thu Aug 15 13:14:44 2024 +++ src/include/uchar.h Thu Aug 15 21:19:45 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: uchar.h,v 1.1 2024/08/15 13:14:44 riastradh Exp $ */ +/* $NetBSD: uchar.h,v 1.2 2024/08/15 21:19:45 riastradh Exp $ */ /*- * Copyright (c) 2024 The NetBSD Foundation, Inc. @@ -28,9 +28,8 @@ /* * C11, 7.28: Unicode utilities <uchar.h> - * - * `1. The header <uchar.h> declares types and functions for - * manipulating Unicode characters.' + * C17, 7.28: Unicode utilities <uchar.h> (unchanged from C11) + * C23, 7.30: Unicode utilities <uchar.h> */ #ifndef _UCHAR_H @@ -39,7 +38,20 @@ #include <sys/ansi.h> /* - * `2. The types declared are mbstate_t (described in 7.30.1) and + * C23 `2. The macro + * + * __STDC_VERSION_UCHAR_H__ + * + * is an integer constant expression with a value equivalent + * to 202311L.' 
+ */ +#if defined(_NETBSD_SOURCE) || defined(_ISOC23_SOURCE) || \ + __STDC_VERSION__ - 0 >= 202311L +#define __STDC_VERSION_UCHAR_H__ 202311L +#endif + +/* + * C11 `2. The types declared are mbstate_t (described in 7.30.1) and * size_t (described in 7.19); * * char16_t @@ -65,6 +77,17 @@ typedef _BSD_SIZE_T_ size_t; #undef _BSD_SIZE_T_ #endif +/* + * C23 `char8_t...is an unsigned integer type used for 8-bit + * characters and is the same type as unsigned char' + */ +#if defined(_NETBSD_SOURCE) || defined(_ISOC23_SOURCE) || \ + __STDC_VERSION__ - 0 >= 202311L +#if !defined(__cpp_char8_t) || __cpp_char8_t < 201811L +typedef unsigned char char8_t; +#endif +#endif + #if !defined(__cplusplus) || __cplusplus < 201103L typedef __UINT_LEAST16_TYPE__ char16_t; typedef __UINT_LEAST32_TYPE__ char32_t; @@ -72,6 +95,12 @@ typedef __UINT_LEAST32_TYPE__ char32_t; __BEGIN_DECLS +#if defined(_NETBSD_SOURCE) || defined(_ISOC23_SOURCE) || \ + __STDC_VERSION__ - 0 >= 202311L +size_t mbrtoc8(char8_t *__restrict, const char *__restrict, size_t, + mbstate_t *__restrict); +size_t c8rtomb(char *__restrict, char8_t, mbstate_t *__restrict); +#endif size_t mbrtoc16(char16_t *__restrict, const char *__restrict, size_t, mbstate_t *__restrict); size_t c16rtomb(char *__restrict, char16_t, mbstate_t *__restrict); Index: src/lib/libc/locale/Makefile.inc diff -u src/lib/libc/locale/Makefile.inc:1.66 src/lib/libc/locale/Makefile.inc:1.67 --- src/lib/libc/locale/Makefile.inc:1.66 Thu Aug 15 14:16:33 2024 +++ src/lib/libc/locale/Makefile.inc Thu Aug 15 21:19:45 2024 @@ -1,5 +1,5 @@ # from: @(#)Makefile.inc 5.1 (Berkeley) 2/18/91 -# $NetBSD: Makefile.inc,v 1.66 2024/08/15 14:16:33 riastradh Exp $ +# $NetBSD: Makefile.inc,v 1.67 2024/08/15 21:19:45 riastradh Exp $ # locale sources .PATH: ${ARCHDIR}/locale ${.CURDIR}/locale @@ -13,8 +13,10 @@ SRCS+= setlocale.c __mb_cur_max.c \ SRCS+= c16rtomb.c SRCS+= c32rtomb.c +SRCS+= c8rtomb.c SRCS+= mbrtoc16.c SRCS+= mbrtoc32.c +SRCS+= mbrtoc8.c CPPFLAGS.c32rtomb.c+= 
-I${LIBCDIR}/citrus CPPFLAGS.mbrtoc32.c+= -I${LIBCDIR}/citrus @@ -38,8 +40,10 @@ MAN+= btowc.3 mbrtowc.3 mbsrtowcs.3 \ MAN+= c16rtomb.3 MAN+= c32rtomb.3 +MAN+= c8rtomb.3 MAN+= mbrtoc16.3 MAN+= mbrtoc32.3 +MAN+= mbrtoc8.3 MAN+= iswalnum.3 wctype.3 iswctype.3 \ towlower.3 wctrans.3 towctrans.3 \ Index: src/share/man/man3/uchar.3 diff -u src/share/man/man3/uchar.3:1.1 src/share/man/man3/uchar.3:1.2 --- src/share/man/man3/uchar.3:1.1 Thu Aug 15 14:16:34 2024 +++ src/share/man/man3/uchar.3 Thu Aug 15 21:19:45 2024 @@ -1,4 +1,4 @@ -.\" $NetBSD: uchar.3,v 1.1 2024/08/15 14:16:34 riastradh Exp $ +.\" $NetBSD: uchar.3,v 1.2 2024/08/15 21:19:45 riastradh Exp $ .\" .\" Copyright (c) 2024 The NetBSD Foundation, Inc. .\" All rights reserved. @@ -24,7 +24,7 @@ .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" -.Dd August 14, 2024 +.Dd August 15, 2024 .Dt UCHAR 3 .Os .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" @@ -43,6 +43,12 @@ units. .\"""""""""""""""""""""""""""""""""""""" .Ss Types .Bl -tag -width ".Vt char32_t" +.It Vt char8_t +(C23) +Unsigned integer type for UTF-8 code units. +.Pp +Same type as +.Vt unsigned char . .It Vt char16_t Unsigned integer type for UTF-16 code units. .Pp @@ -86,17 +92,21 @@ and The .In uchar.h header file declares the functions +.Xr mbrtoc8 3 , +.Xr c8rtomb 3 , .Xr mbrtoc16 3 , .Xr c16rtomb 3 , .Xr mbrtoc32 3 , and .Xr c32rtomb 3 -for conversion between multibyte sequences and UTF-16/UTF-32 code +for conversion between multibyte sequences and UTF-8/UTF-16/UTF-32 code units. .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" .Sh SEE ALSO +.Xr c8rtomb 3 , .Xr c16rtomb 3 , .Xr c32rtomb 3 , +.Xr mbrtoc8 3 , .Xr mbrtoc16 3 , .Xr mbrtoc32 3 .Rs @@ -115,12 +125,22 @@ units. .%I Internet Engineering Task Force .%U https://datatracker.ietf.org/doc/html/rfc2781 .Re +.Rs +.%A F. 
Yergeau +.%T UTF-8, a transformation format of ISO 10646 +.%R RFC 3629 +.%D November 2003 +.%I Internet Engineering Task Force +.%U https://datatracker.ietf.org/doc/html/rfc3629 +.Re .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" .Sh STANDARDS The .In uchar.h header file conforms to .St -isoC-2011 +.\" .St -isoC-2023 +.\" .\" XXX PR misc/58600: man pages lack C17, C23, C++98, C++03, C++11, C++17, C++20, C++23 citation syntax and .St -p1003.1-2024 . .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" Index: src/tests/lib/libc/locale/Makefile diff -u src/tests/lib/libc/locale/Makefile:1.17 src/tests/lib/libc/locale/Makefile:1.18 --- src/tests/lib/libc/locale/Makefile:1.17 Thu Aug 15 14:16:34 2024 +++ src/tests/lib/libc/locale/Makefile Thu Aug 15 21:19:45 2024 @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.17 2024/08/15 14:16:34 riastradh Exp $ +# $NetBSD: Makefile,v 1.18 2024/08/15 21:19:45 riastradh Exp $ .include <bsd.own.mk> @@ -7,11 +7,13 @@ TESTSDIR= ${TESTSBASE}/lib/libc/locale TESTS_C+= t_btowc TESTS_C+= t_c16rtomb TESTS_C+= t_c32rtomb +TESTS_C+= t_c8rtomb TESTS_C+= t_digittoint TESTS_C+= t_ducet TESTS_C+= t_io TESTS_C+= t_mbrtoc16 TESTS_C+= t_mbrtoc32 +TESTS_C+= t_mbrtoc8 TESTS_C+= t_mbrtowc TESTS_C+= t_mbsnrtowcs TESTS_C+= t_mbstowcs Added files: Index: src/lib/libc/locale/c8rtomb.3 diff -u /dev/null src/lib/libc/locale/c8rtomb.3:1.1 --- /dev/null Thu Aug 15 21:19:46 2024 +++ src/lib/libc/locale/c8rtomb.3 Thu Aug 15 21:19:45 2024 @@ -0,0 +1,191 @@ +.\" $NetBSD: c8rtomb.3,v 1.1 2024/08/15 21:19:45 riastradh Exp $ +.\" +.\" Copyright (c) 2024 The NetBSD Foundation, Inc. +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS +.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS +.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. +.\" +.Dd August 15, 2024 +.Dt C8RTOMB 3 +.Os +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh NAME +.Nm c8rtomb +.Nd Restartable UTF-8 code unit to multibyte conversion +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh LIBRARY +.Lb libc +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh SYNOPSIS +.In uchar.h +.Ft size_t +.Fn c8rtomb "char * restrict s" \ +"char8_t c8" \ +"mbstate_t * restrict ps" +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh DESCRIPTION +The +.Nm +function attempts to encode Unicode input as a multibyte character +sequence output at +.Fa s +in the current locale, writing anywhere between zero and +.Dv MB_CUR_MAX +bytes, inclusive, to +.Fa s , +depending on the inputs and conversion state +.Fa ps . +.Pp +The input +.Fa c8 +is a UTF-8 code unit. 
+Successive calls to +.Nm +must provide well-formed UTF-8 code unit sequences. +If +.Fa c8 , +when appended to the sequence of code units passed in previous calls +with the same state +.Fa ps , +does not form a well-formed UTF-8 code unit sequence, then +.Nm +will return +.Li (size_t)-1 +to denote failure with +.Xr errno 2 +set to +.Er EILSEQ . +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh RETURN VALUES +The +.Nm +function returns the number of bytes written to +.Fa s +on success, or sets +.Xr errno 2 +and returns +.Li "(size_t)-1" +on failure. +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh EXAMPLES +Convert a UTF-8 code unit sequence to a multibyte string, +NUL-terminate it, and print it: +.Bd -literal -offset indent +char8_t c8[] = { 0xf0, 0x9f, 0x92, 0xa9 }; +char buf[__arraycount(c8)*MB_CUR_MAX + 1], *s = buf; +size_t i; +mbstate_t mbs = {0}; /* initial conversion state */ + +for (i = 0; i < __arraycount(c8); i++) { + size_t len; + + len = c8rtomb(s, c8[i], &mbs); + if (len == (size_t)-1) + err(1, "c8rtomb"); + assert(len < sizeof(buf) - (s - buf)); + s += len; +} +*s = '\e0'; /* NUL-terminate */ +printf("%s\en", buf); +.Ed +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh ERRORS +.Bl -tag -width ".Bq Er EILSEQ" +.It Bq Er EILSEQ +The UTF-8 code unit sequence ending with +.Fa c8 +is not well-formed. +.It Bq Er EILSEQ +The Unicode scalar value requested cannot be encoded as a multibyte +sequence in the current locale. +.It Bq Er EIO +An error occurred in loading the locale's character conversions. 
+.El +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh SEE ALSO +.Xr c16rtomb 3 , +.Xr c32rtomb 3 , +.Xr mbrtoc8 3 , +.Xr mbrtoc16 3 , +.Xr mbrtoc32 3 , +.Xr uchar 3 +.Rs +.%B The Unicode Standard +.%O Version 15.0 \(em Core Specification +.%Q The Unicode Consortium +.%D September 2022 +.%U https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf +.Re +.Rs +.%A F. Yergeau +.%T UTF-8, a transformation format of ISO 10646 +.%R RFC 3629 +.%D November 2003 +.%I Internet Engineering Task Force +.%U https://datatracker.ietf.org/doc/html/rfc3629 +.Re +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.\" .Sh STANDARDS +.\" The +.\" .Nm +.\" function conforms to +.\" .St -isoC-2023 . +.\" .\" XXX PR misc/58600: man pages lack C17, C23, C++98, C++03, C++11, C++17, C++20, C++23 citation syntax +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh HISTORY +The +.Nm +function first appeared in +.Nx 11.0 . +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh BUGS +It is not clear from the standard how +.Nm +is supposed to behave when given an incomplete UTF-8 code unit sequence +followed by a NUL: +.Bd -literal -offset indent +c8rtomb(s, 0xf0, ps); +c8rtomb(s, 0x9f, ps); +c8rtomb(s, 0x92, ps); +c8rtomb(s, L'\e0', ps); +.Ed +.Pp +Currently this fails with +.Er EILSEQ +which matches other implementations, but this is at odds with language +in the standard which suggests that passing +.Li L'\e0' +should unconditionally store a null byte and reset +.Fa ps +to the initial conversion state: +.Bd -offset indent +If +.Fa c8 +is a null character, a null byte is stored, preceded by any shift +sequence needed to restore the initial shift state; the resulting state +described is the initial conversion state. +.Ed +.Pp +However, it is unclear what else this should store besides a null +byte. 
+Should it discard the pending UTF-8 code unit sequence, or convert it +to something else and store that? Index: src/lib/libc/locale/c8rtomb.c diff -u /dev/null src/lib/libc/locale/c8rtomb.c:1.1 --- /dev/null Thu Aug 15 21:19:46 2024 +++ src/lib/libc/locale/c8rtomb.c Thu Aug 15 21:19:45 2024 @@ -0,0 +1,213 @@ +/* $NetBSD: c8rtomb.c,v 1.1 2024/08/15 21:19:45 riastradh Exp $ */ + +/*- + * Copyright (c) 2024 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * c8rtomb(s, c8, ps) + * + * Encode the Unicode UTF-8 code unit c8 into the multibyte buffer + * s under the current locale, using multibyte encoding state ps. 
+ * + * If c8 is not the last byte of a UTF-8 scalar value sequence, no + * output will be produced, but c8 will be remembered; this must + * be followed by another call passing the following bytes. + * + * Return the number of bytes stored on success, or (size_t)-1 on + * error with errno set to EILSEQ. + * + * At most MB_CUR_MAX bytes will be stored. + * + * References: + * + * The Unicode Standard, Version 15.0 -- Core Specification, The + * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8, + * p. 124. + * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 + * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 + * + * F. Yergeau, `UTF-8, a transformation format of ISO 10646', + * RFC 3629, Internet Engineering Task Force, November 2003. + * https://datatracker.ietf.org/doc/html/rfc3629 + */ + +#include <sys/cdefs.h> +__RCSID("$NetBSD: c8rtomb.c,v 1.1 2024/08/15 21:19:45 riastradh Exp $"); + +#include <assert.h> +#include <errno.h> +#include <limits.h> +#include <stdalign.h> +#include <stddef.h> +#include <stdint.h> +#include <uchar.h> + +#include "c32rtomb.h" + +struct c8rtombstate { + char32_t state_c32; /* 8-bit state and 24-bit buffer */ + mbstate_t mbs; +}; +__CTASSERT(offsetof(struct c8rtombstate, mbs) <= sizeof(mbstate_t)); +__CTASSERT(sizeof(struct c32rtombstate) <= sizeof(mbstate_t) - + offsetof(struct c8rtombstate, mbs)); +__CTASSERT(alignof(struct c8rtombstate) <= alignof(mbstate_t)); + +/* + * UTF-8 validation, inspired by Bjoern Hoehrmann's UTF-8 decoder at + * <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>, but reimplemented + * from scratch. 
+ */ + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 96 + +typedef uint_fast8_t utf8_class_t; +typedef uint_fast8_t utf8_state_t; + +static uint8_t utf8_classtab[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 11,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 7,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, +}; + +static uint8_t utf8_statetab[] = { + 0,96,12,36,48,84,72,60,96,96,96,24, 96, 0,96,96,96,96,96,96, 0, 0,96,96, + 96,12,96,96,96,96,96,96,96,96,96,96, 96,12,96,96,96,96,96,96,12,12,96,96, + 96,96,96,96,96,96,96,96,12,12,96,96, 96,36,96,96,96,96,96,96,96,36,96,96, + 96,36,96,96,96,96,96,96,36,36,96,96, 96,96,96,96,96,96,96,96,36,96,96,96, + 96,96,96,96,96,96,96,96,96,96,96,96, +}; + +static utf8_state_t +utf8_decode_step(utf8_state_t state, char8_t c8, char32_t *pc32) +{ + const utf8_class_t class = utf8_classtab[c8]; + + *pc32 = (state == UTF8_ACCEPT + ? (c8 & (0xff >> class)) + : ((c8 & 0x3f) | (*pc32 << 6))); + + return utf8_statetab[state + class]; +} + +size_t +c8rtomb(char *restrict s, char8_t c8, mbstate_t *restrict ps) +{ + static mbstate_t psbuf; + char buf[MB_LEN_MAX]; + struct c8rtombstate *S; + utf8_state_t state; + char32_t c32; + + /* + * `If ps is a null pointer, each function uses its own + * internal mbstate_t object instead, which is initialized at + * program startup to the initial conversion state; the + * functions are not required to avoid data races with other + * calls to the same function in this case. The + * implementation behaves as if no library function calls + * these functions with a null pointer for ps.' 
+ */ + if (ps == NULL) + ps = &psbuf; + + /* + * `If s is a null pointer, the c8rtomb function is equivalent + * to the call + * + * c8rtomb(buf, u8'\0', ps) + * + * where buf is an internal buffer.' + */ + if (s == NULL) { + s = buf; + c8 = 0; /* XXX u8'\0' */ + } + + /* + * Open the private UTF-8 decoding state. + */ + S = (struct c8rtombstate *)ps; + +#if 0 + /* + * `If c8 is a null character, a null byte is stored, preceded + * by any shift sequence needed to restore the initial shift + * state; the resulting state described is the initial + * conversion state.' + * + * XXX But what else gets stored? Do we just discard any + * pending sequence, or do we convert it to something else, or + * what? + */ + if (c8 == u8'\0') { + memset(S->buf, 0, sizeof(S->buf)); + S->n = 0; + } +#endif + + /* + * Get the current state and buffer. + */ + __CTASSERT(UTF8_ACCEPT == 0); /* initial conversion state */ + state = __SHIFTOUT(S->state_c32, __BITS(31,24)); + c32 = __SHIFTOUT(S->state_c32, __BITS(23,0)); + + /* + * Feed the byte into the state machine to update the state. + */ + state = utf8_decode_step(state, c8, &c32); + switch (state) { + case UTF8_REJECT: + /* + * Invalid UTF-8. Fail with EILSEQ. + */ + errno = EILSEQ; + return (size_t)-1; + default: + /* + * Valid UTF-8 so far but incomplete. Update state and + * output nothing. + */ + S->state_c32 = __SHIFTIN(state, __BITS(31,24)) | + __SHIFTIN(c32, __BITS(23,0)); + return 0; + case UTF8_ACCEPT: + /* + * We have a scalar value. Clear the state and output + * the scalar value. + */ + __CTASSERT(UTF8_ACCEPT == 0); + S->state_c32 = 0; + return c32rtomb(s, c32, &S->mbs); + } +} Index: src/lib/libc/locale/mbrtoc8.3 diff -u /dev/null src/lib/libc/locale/mbrtoc8.3:1.1 --- /dev/null Thu Aug 15 21:19:46 2024 +++ src/lib/libc/locale/mbrtoc8.3 Thu Aug 15 21:19:45 2024 @@ -0,0 +1,307 @@ +.\" $NetBSD: mbrtoc8.3,v 1.1 2024/08/15 21:19:45 riastradh Exp $ +.\" +.\" Copyright (c) 2024 The NetBSD Foundation, Inc. +.\" All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS +.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS +.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. 
+.\" +.Dd August 15, 2024 +.Dt MBRTOC8 3 +.Os +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh NAME +.Nm mbrtoc8 +.Nd Restartable multibyte to UTF-8 code unit conversion +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh LIBRARY +.Lb libc +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh SYNOPSIS +.In uchar.h +.Ft size_t +.Fn mbrtoc8 "char8_t * restrict pc8" \ +"const char * restrict s" \ +"size_t n" \ +"mbstate_t * restrict ps" +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh DESCRIPTION +The +.Nm +function attempts to decode a multibyte character sequence at +.Fa s +of up to +.Fa n +bytes in the current locale, and yield the content as UTF-8 code +units via the output parameter +.Fa pc8 . +.Fa pc8 +may be null, in which case no output is stored. +.Bl -bullet +.It +If the multibyte sequence at +.Fa s +is invalid or an error occurs in decoding, +.Nm +returns +.Li (size_t)-1 +and sets +.Xr errno 2 +to indicate the error. +.It +If the multibyte sequence at +.Fa s +is still incomplete after +.Fa n +bytes, including any previously processed input saved in +.Fa ps , +.Nm +saves its state in +.Fa ps +after all the input so far and returns +.Li "(size_t)-2". +.It +If +.Nm +finds the null scalar value at +.Fa s , +then it stores zero at +.Li * Ns Fa pc8 +and returns zero. +.It +If +.Nm +finds a nonnull scalar value in the US-ASCII range, i.e., a 7-bit +scalar value, then it stores the scalar value at +.Li * Ns Fa pc8 , +and returns the number of bytes it read from the input. +.It +If +.Nm +finds a scalar value outside the US-ASCII range, it: +.Bl -dash -compact +.It +stores the leading byte in the scalar value's UTF-8 encoding at +.Li * Ns Fa pc8 ; +.It +stores conversion state in +.Fa ps +to remember the rest of the pending scalar value; and +.It +returns the number of bytes it read from the input. 
+.El +.It +If +.Nm +had previously found a scalar value outside the US-ASCII range, then, +instead of any of the above options, it: +.Bl -dash -compact +.It +stores the next byte in the scalar value's UTF-8 encoding at +.Li * Ns Fa pc8 ; +.It +updates the conversion state in +.Fa ps +to consume this byte; and +.It +returns +.Li (size_t)-3 +to indicate that no bytes were consumed but a code unit was yielded +nevertheless. +.El +.El +.Pp +If +.Fa s +is a null pointer, the +.Nm +call is equivalent to: +.Bd -ragged -offset indent +.Fo mbrtoc8 +.Li NULL , +.Li \*q\*q , +.Li 1 , +.Fa ps +.Fc +.Ed +.Pp +This always returns zero, and has the effect of resetting +.Fa ps +to the initial conversion state, without writing to +.Fa pc8 , +even if it is nonnull. +.Pp +If +.Fa ps +is a null pointer, +.Nm +uses an internal +.Vt mbstate_t +object with static storage duration, distinct from all other +.Vt mbstate_t +objects (including those used by +.Xr mbrtoc16 3 , +.Xr mbrtoc32 3 , +.Xr c8rtomb 3 , +.Xr c16rtomb 3 , +and +.Xr c32rtomb 3 ) , +which is initialized at program startup to the initial conversion +state. +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh RETURN VALUES +The +.Nm +function returns: +.Bl -tag -width ".Li (size_t)-3" -offset indent +.It Li 0 +[null] +if within the next +.Fa n +bytes at +.Fa s +the first multibyte character is null. +.It Fa i +[code unit] +where +.Li 0 +\*(Le +.Fa i +\*(Le +.Fa n , +if either +.Fa ps +is in the initial conversion state or the previous call to +.Nm +with +.Fa ps +had not yielded an incomplete UTF-8 code unit, and within the first +.Fa i +bytes at +.Fa s +a Unicode scalar value was decoded. +.It Li (size_t)-3 +[continuation] +if the previous call to +.Nm +with +.Fa ps +had yielded an incomplete UTF-8 code unit for a Unicode scalar value +outside the US-ASCII range; no additional input is consumed in this +case. 
+.It Li (size_t)-2 +[incomplete] +if either +.Fa ps +is in the initial conversion state or the previous call to +.Nm +with +.Fa ps +had not yielded an incomplete UTF-8 code unit, and within the first +.Fa n +bytes at +.Fa s , +including any previously buffered input, no complete Unicode scalar +value could be decoded. +.It Li (size_t)-1 +[error] +if any encoding error was detected; +.Xr errno 2 +is set to reflect the error. +.El +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh EXAMPLES +Print the UTF-8 code units of a multibyte string in hexadecimal text: +.Bd -literal -offset indent +char *s = ...; +size_t n = ...; +mbstate_t mbs = {0}; /* initial conversion state */ + +while (n) { + char8_t c8; + size_t len; + + len = mbrtoc8(&c8, s, n, &mbs); + switch (len) { + case 0: /* null terminator */ + assert(c8 == '\e0'); + goto out; + default: /* consumed input and yielded a byte c8 */ + printf("0x%02hhx\en", c8); + break; + case (size_t)-3: /* yielded a pending byte c8 */ + printf("continue 0x%02hhx\en", c8); + continue; /* no input consumed: do not advance s or n */ + case (size_t)-2: /* incomplete */ + printf("incomplete\en"); + goto readmore; + case (size_t)-1: /* error */ + printf("error: %d\en", errno); + goto out; + } + s += len; + n -= len; +} +.Ed +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh ERRORS +.Bl -tag -width ".Bq Er EILSEQ" +.It Bq Er EILSEQ +The multibyte sequence cannot be decoded as a Unicode scalar value. +.It Bq Er EIO +An error occurred in loading the locale's character conversions. +.El +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh SEE ALSO +.Xr c8rtomb 3 , +.Xr c16rtomb 3 , +.Xr c32rtomb 3 , +.Xr mbrtoc16 3 , +.Xr mbrtoc32 3 , +.Xr uchar 3 +.Rs +.%B The Unicode Standard +.%O Version 15.0 \(em Core Specification +.%Q The Unicode Consortium +.%D September 2022 +.%U https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf +.Re +.Rs +.%A F.
Yergeau +.%T UTF-8, a transformation format of ISO 10646 +.%R RFC 3629 +.%D November 2003 +.%I Internet Engineering Task Force +.%U https://datatracker.ietf.org/doc/html/rfc3629 +.Re +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.\" .Sh STANDARDS +.\" The +.\" .Nm +.\" function conforms to +.\" .St -isoC-2023 . +.\" .\" XXX PR misc/58600: man pages lack C17, C23, C++98, C++03, C++11, C++17, C++20, C++23 citation syntax +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh HISTORY +The +.Nm +function first appeared in +.Nx 11.0 . Index: src/lib/libc/locale/mbrtoc8.c diff -u /dev/null src/lib/libc/locale/mbrtoc8.c:1.1 --- /dev/null Thu Aug 15 21:19:46 2024 +++ src/lib/libc/locale/mbrtoc8.c Thu Aug 15 21:19:45 2024 @@ -0,0 +1,210 @@ +/* $NetBSD: mbrtoc8.c,v 1.1 2024/08/15 21:19:45 riastradh Exp $ */ + +/*- + * Copyright (c) 2024 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * mbrtoc8(&c8, s, n, ps) + * + * Decode a Unicode scalar value from up to n bytes out of the + * multibyte string s, using multibyte encoding state ps, and + * store the next code unit in the UTF-8 representation of that + * scalar value at c8. + * + * If the UTF-8 representation of that scalar value is multiple + * bytes long, mbrtoc8 will yield the leading byte in one call that + * consumes input, and will yield the trailing bytes in subsequent + * calls without consuming any input and returning (size_t)-3 + * instead. + * + * Return the number of bytes consumed on success, or: + * + * - 0 if the code unit is NUL, or + * - (size_t)-3 if a trailing byte was returned without consuming + * any additional input, or + * - (size_t)-2 if the input is incomplete, or + * - (size_t)-1 on error with errno set to EILSEQ. + * + * In the case of incomplete input, the decoding state so far + * after processing s[0], s[1], ..., s[n - 1] is saved in ps, so + * subsequent calls to mbrtoc8 will pick up n bytes later into + * the input stream. + * + * References: + * + * The Unicode Standard, Version 15.0 -- Core Specification, The + * Unicode Consortium, Sec. 3.8 `Surrogates', p. 119.
+ * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 + * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 + * + * The Unicode Standard, Version 15.0 -- Core Specification, The + * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-16, + * p. 124. + * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 + * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 + * + * F. Yergeau, `UTF-8, a transformation format of ISO 10646', + * RFC 3629, Internet Engineering Task Force, November 2003. + * https://datatracker.ietf.org/doc/html/rfc3629 + */ + +#include <sys/cdefs.h> +__RCSID("$NetBSD: mbrtoc8.c,v 1.1 2024/08/15 21:19:45 riastradh Exp $"); + +#include <assert.h> +#include <errno.h> +#include <stdalign.h> +#include <stddef.h> +#include <uchar.h> + +#include "mbrtoc32.h" + +struct mbrtoc8state { + char8_t nleft; + char8_t buf[3]; + mbstate_t mbs; +}; +__CTASSERT(offsetof(struct mbrtoc8state, mbs) <= sizeof(mbstate_t)); +__CTASSERT(sizeof(struct mbrtoc32state) <= sizeof(mbstate_t) - + offsetof(struct mbrtoc8state, mbs)); +__CTASSERT(alignof(struct mbrtoc8state) <= alignof(mbstate_t)); + +size_t +mbrtoc8(char8_t *restrict pc8, const char *restrict s, size_t n, + mbstate_t *restrict ps) +{ + static mbstate_t psbuf; + struct mbrtoc8state *S; + char32_t c32; + size_t len; + + /* + * `If ps is a null pointer, each function uses its own + * internal mbstate_t object instead, which is initialized at + * program startup to the initial conversion state; the + * functions are not required to avoid data races with other + * calls to the same function in this case. The + * implementation behaves as if no library function calls + * these functions with a null pointer for ps.' 
+ */ + if (ps == NULL) + ps = &psbuf; + + /* + * `If s is a null pointer, the mbrtoc8 function is equivalent + * to the call: + * + * mbrtoc8(NULL, "", 1, ps) + * + * In this case, the values of the parameters pc8 and n are + * ignored.' + */ + if (s == NULL) { + pc8 = NULL; + s = ""; + n = 1; + } + + /* + * Get the private conversion state. + */ + S = (struct mbrtoc8state *)ps; + + /* + * If there are pending trailing bytes, yield them and return + * (size_t)-3 to indicate that no bytes of input were consumed. + */ + if (S->nleft) { + if (pc8) + *pc8 = S->buf[sizeof(S->buf) - S->nleft]; + S->buf[sizeof(S->buf) - S->nleft] = 0; /* paranoia */ + S->nleft--; + return (size_t)-3; + } + + /* + * Consume the next scalar value. If no full scalar value can + * be obtained, stop here. + */ + len = mbrtoc32(&c32, s, n, &S->mbs); + switch (len) { + case 0: /* NUL */ + if (pc8) + *pc8 = 0; + return 0; + case (size_t)-2: /* still incomplete after n bytes */ + case (size_t)-1: /* error */ + return len; + default: /* consumed len bytes of input */ + break; + } + + /* + * We consumed a scalar value from the input. + * + * Encode it as UTF-8, yield the leading byte, and buffer the + * trailing bytes to yield later. + * + * Table 3-6: UTF-8 Bit Distribution + * Table 3-7: Well-Formed UTF-8 Byte Sequences + */ + switch (c32) { + case 0x00 ... 0x7f: + if (pc8) + *pc8 = c32; + _DIAGASSERT(S->nleft == 0); + break; + case 0x0080 ... 0x07ff: + if (pc8) + *pc8 = 0xc0 | __SHIFTOUT(c32, __BITS(10,6)); + S->buf[2] = 0x80 | __SHIFTOUT(c32, __BITS(5,0)); + S->nleft = 1; + break; + case 0x0800 ... 0xffff: + if (pc8) + *pc8 = 0xe0 | __SHIFTOUT(c32, __BITS(15,12)); + S->buf[1] = 0x80 | __SHIFTOUT(c32, __BITS(11,6)); + S->buf[2] = 0x80 | __SHIFTOUT(c32, __BITS(5,0)); + S->nleft = 2; + break; + case 0x10000 ... 
0x10ffff: + if (pc8) + *pc8 = 0xf0 | __SHIFTOUT(c32, __BITS(20,18)); + S->buf[0] = 0x80 | __SHIFTOUT(c32, __BITS(17,12)); + S->buf[1] = 0x80 | __SHIFTOUT(c32, __BITS(11,6)); + S->buf[2] = 0x80 | __SHIFTOUT(c32, __BITS(5,0)); + S->nleft = 3; + break; + default: + errno = EILSEQ; + return (size_t)-1; + } + + /* + * Return the number of bytes consumed from the input. + */ + return len; +} Index: src/tests/lib/libc/locale/t_c8rtomb.c diff -u /dev/null src/tests/lib/libc/locale/t_c8rtomb.c:1.1 --- /dev/null Thu Aug 15 21:19:46 2024 +++ src/tests/lib/libc/locale/t_c8rtomb.c Thu Aug 15 21:19:45 2024 @@ -0,0 +1,205 @@ +/* $NetBSD: t_c8rtomb.c,v 1.1 2024/08/15 21:19:45 riastradh Exp $ */ + +/*- + * Copyright (c) 2002 Tim J. Robbins + * All rights reserved. + * + * Copyright (c) 2013 Ed Schouten <e...@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Test program for c8rtomb() as specified by C23. + */ + +#include <sys/cdefs.h> +__RCSID("$NetBSD: t_c8rtomb.c,v 1.1 2024/08/15 21:19:45 riastradh Exp $"); + +#include <errno.h> +#include <limits.h> +#include <locale.h> +#include <stdio.h> +#include <string.h> +#include <uchar.h> + +#include <atf-c.h> + +static void +require_lc_ctype(const char *locale_name) +{ + char *lc_ctype_set; + + lc_ctype_set = setlocale(LC_CTYPE, locale_name); + if (lc_ctype_set == NULL) + atf_tc_fail("setlocale(LC_CTYPE, \"%s\") failed; errno=%d", + locale_name, errno); + + ATF_REQUIRE_EQ_MSG(strcmp(lc_ctype_set, locale_name), 0, + "lc_ctype_set=%s locale_name=%s", lc_ctype_set, locale_name); +} + +static mbstate_t s; +static char buf[MB_LEN_MAX + 1]; + +ATF_TC_WITHOUT_HEAD(c8rtomb_c_locale_test); +ATF_TC_BODY(c8rtomb_c_locale_test, tc) +{ + size_t n; + + require_lc_ctype("C"); + + /* + * If the buffer argument is NULL, c8 is implicitly 0, + * c8rtomb() resets its internal state. 
+ */ + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, '\0', NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, 0x80, NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, 0xc0, NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, 0xe0, NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, 0xf0, NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, 0xf8, NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, 0xfc, NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, 0xfe, NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, 0xff, NULL)), 1, "n=%zu", n); + + + /* Null wide character. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0, &s)), 1, "n=%zu", n); + ATF_CHECK_MSG(((unsigned char)buf[0] == 0 && + (unsigned char)buf[1] == 0xcc), + "buf=[%02x %02x]", buf[0], buf[1]); + + /* Latin letter A, internal state. */ + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, '\0', NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(NULL, 'A', NULL)), 1, "n=%zu", n); + + /* Latin letter A. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 'A', &s)), 1, "n=%zu", n); + ATF_CHECK_MSG(((unsigned char)buf[0] == 'A' && + (unsigned char)buf[1] == 0xcc), + "buf=[%02x %02x]", buf[0], buf[1]); + + /* Unicode character 'Pile of poo'. 
*/ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xf0, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0x9f, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0x92, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xa9, &s)), (size_t)-1, + "n=%zu", n); + ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=%d", errno); + ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=[%02x]", buf[0]); +} + +ATF_TC_WITHOUT_HEAD(c8rtomb_iso_8859_1_test); +ATF_TC_BODY(c8rtomb_iso_8859_1_test, tc) +{ + size_t n; + + require_lc_ctype("en_US.ISO8859-1"); + + /* Unicode character 'Euro sign'. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xe2, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0x82, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xac, &s)), (size_t)-1, + "n=%zu", n); + ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=%d", errno); + ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=[%02x]", buf[0]); +} + +ATF_TC_WITHOUT_HEAD(c8rtomb_iso_8859_15_test); +ATF_TC_BODY(c8rtomb_iso_8859_15_test, tc) +{ + size_t n; + + require_lc_ctype("en_US.ISO8859-15"); + + /* Unicode character 'Euro sign'. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xe2, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0x82, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xac, &s)), 1, "n=%zu", n); + ATF_CHECK_MSG(((unsigned char)buf[0] == 0xa4 && + (unsigned char)buf[1] == 0xcc), + "buf=[%02x %02x]", buf[0], buf[1]); +} + +ATF_TC_WITHOUT_HEAD(c8rtomb_utf_8_test); +ATF_TC_BODY(c8rtomb_utf_8_test, tc) +{ + size_t n; + + require_lc_ctype("en_US.UTF-8"); + + /* Unicode character 'Pile of poo'. 
*/ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xf0, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0x9f, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0x92, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xa9, &s)), 4, "n=%zu", n); + ATF_CHECK_MSG(((unsigned char)buf[0] == 0xf0 && + (unsigned char)buf[1] == 0x9f && + (unsigned char)buf[2] == 0x92 && + (unsigned char)buf[3] == 0xa9 && + (unsigned char)buf[4] == 0xcc), + "buf=[%02x %02x %02x %02x %02x]", + buf[0], buf[1], buf[2], buf[3], buf[4]); + + /* Invalid code; 'Pile of poo' without the last byte. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xf0, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0x9f, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0x92, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 'A', &s)), (size_t)-1, + "n=%zu", n); + ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=%d", errno); + ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=[%02x]", buf[0]); + + /* Invalid code; 'Pile of poo' without the first byte. 
*/ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0x9f, &s)), (size_t)-1, + "n=%zu", n); + ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=%d", errno); + ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=[%02x]", buf[0]); +} + +ATF_TP_ADD_TCS(tp) +{ + + ATF_TP_ADD_TC(tp, c8rtomb_c_locale_test); + ATF_TP_ADD_TC(tp, c8rtomb_iso_8859_1_test); + ATF_TP_ADD_TC(tp, c8rtomb_iso_8859_15_test); + ATF_TP_ADD_TC(tp, c8rtomb_utf_8_test); + + return (atf_no_error()); +} Index: src/tests/lib/libc/locale/t_mbrtoc8.c diff -u /dev/null src/tests/lib/libc/locale/t_mbrtoc8.c:1.1 --- /dev/null Thu Aug 15 21:19:46 2024 +++ src/tests/lib/libc/locale/t_mbrtoc8.c Thu Aug 15 21:19:45 2024 @@ -0,0 +1,268 @@ +/* $NetBSD: t_mbrtoc8.c,v 1.1 2024/08/15 21:19:45 riastradh Exp $ */ + +/*- + * Copyright (c) 2002 Tim J. Robbins + * All rights reserved. + * + * Copyright (c) 2013 Ed Schouten <e...@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Test program for mbrtoc8() as specified by C23. + */ + +#include <sys/cdefs.h> +__RCSID("$NetBSD: t_mbrtoc8.c,v 1.1 2024/08/15 21:19:45 riastradh Exp $"); + +#include <errno.h> +#include <inttypes.h> +#include <limits.h> +#include <locale.h> +#include <string.h> +#include <uchar.h> + +#include <atf-c.h> + +static void +require_lc_ctype(const char *locale_name) +{ + char *lc_ctype_set; + + lc_ctype_set = setlocale(LC_CTYPE, locale_name); + if (lc_ctype_set == NULL) + atf_tc_fail("setlocale(LC_CTYPE, \"%s\") failed; errno=%d", + locale_name, errno); + + ATF_REQUIRE_EQ_MSG(strcmp(lc_ctype_set, locale_name), 0, + "lc_ctype_set=%s locale_name=%s", lc_ctype_set, locale_name); +} + +static mbstate_t s; +static char8_t c8; + +ATF_TC_WITHOUT_HEAD(mbrtoc8_c_locale_test); +ATF_TC_BODY(mbrtoc8_c_locale_test, tc) +{ + size_t n; + + require_lc_ctype("C"); + + /* Null wide character, internal state. */ + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 1, NULL)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8); + + /* Null wide character. */ + memset(&s, 0, sizeof(s)); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 1, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8); + + /* Latin letter A, internal state. 
*/ + ATF_CHECK_EQ_MSG((n = mbrtoc8(NULL, 0, 0, NULL)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "A", 1, NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 'A', "c8=0x%"PRIx8" 'A'=0x%"PRIx8, + (uint8_t)c8, (uint8_t)'A'); + + /* Latin letter A. */ + memset(&s, 0, sizeof(s)); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "A", 1, &s)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 'A', "c8=0x%"PRIx8" 'A'=0x%"PRIx8, + (uint8_t)c8, (uint8_t)'A'); + + /* Incomplete character sequence. */ + c8 = 'z'; + memset(&s, 0, sizeof(s)); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-2, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 'z', "c8=0x%"PRIx8" 'z'=0x%"PRIx8, + (uint8_t)c8, (uint8_t)'z'); + + /* Check that mbrtoc8() doesn't access the buffer when n == 0. */ + c8 = 'z'; + memset(&s, 0, sizeof(s)); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-2, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 'z', "c8=0x%"PRIx8" 'z'=0x%"PRIx8, + (uint8_t)c8, (uint8_t)'z'); + + /* Check that mbrtoc8() doesn't read ahead too aggressively. */ + memset(&s, 0, sizeof(s)); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "AB", 2, &s)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 'A', "c8=0x%"PRIx8" 'A'=0x%"PRIx8, + (uint8_t)c8, (uint8_t)'A'); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "C", 1, &s)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 'C', "c8=0x%"PRIx8" 'C'=0x%"PRIx8, + (uint8_t)c8, (uint8_t)'C'); + +} + +ATF_TC_WITHOUT_HEAD(mbrtoc8_iso_8859_1_test); +ATF_TC_BODY(mbrtoc8_iso_8859_1_test, tc) +{ + size_t n; + + require_lc_ctype("en_US.ISO8859-1"); + + /* Currency sign. 
*/ + memset(&s, 0, sizeof(s)); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\xa4", 1, &s)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xc2, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xa4, "c8=0x%"PRIx8, (uint8_t)c8); +} + +ATF_TC_WITHOUT_HEAD(mbrtoc8_iso_8859_15_test); +ATF_TC_BODY(mbrtoc8_iso_8859_15_test, tc) +{ + size_t n; + + require_lc_ctype("en_US.ISO8859-15"); + + /* Euro sign. */ + memset(&s, 0, sizeof(s)); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\xa4", 1, &s)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xe2, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0x82, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xac, "c8=0x%"PRIx8, (uint8_t)c8); +} + +ATF_TC_WITHOUT_HEAD(mbrtoc8_utf_8_test); +ATF_TC_BODY(mbrtoc8_utf_8_test, tc) +{ + size_t n; + + require_lc_ctype("en_US.UTF-8"); + + /* Null wide character, internal state. */ + ATF_CHECK_EQ_MSG((n = mbrtoc8(NULL, 0, 0, NULL)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 1, NULL)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8); + + /* Null wide character. */ + memset(&s, 0, sizeof(s)); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 1, &s)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8); + + /* Latin letter A, internal state. */ + ATF_CHECK_EQ_MSG((n = mbrtoc8(NULL, 0, 0, NULL)), 0, "n=%zu", n); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "A", 1, NULL)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 'A', "c8=0x%"PRIx8" 'A'=0x%"PRIx8, + (uint8_t)c8, (uint8_t)'A'); + + /* Latin letter A. */ + memset(&s, 0, sizeof(s)); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "A", 1, &s)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 'A', "c8=0x%"PRIx8" 'A'=0x%"PRIx8, + (uint8_t)c8, (uint8_t)'A'); + + /* Incomplete character sequence (zero length). 
*/ + c8 = 'z'; + memset(&s, 0, sizeof(s)); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-2, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 'z', "c8=0x%"PRIx8" 'z'=0x%"PRIx8, + (uint8_t)c8, (uint8_t)'z'); + + /* Incomplete character sequence (truncated double-byte). */ + memset(&s, 0, sizeof(s)); + c8 = 0; + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\xc3", 1, &s)), (size_t)-2, + "n=%zu", n); + + /* Same as above, but complete. */ + memset(&s, 0, sizeof(s)); + c8 = 0; + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\xc3\x84", 2, &s)), 2, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xc3, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0x84, "c8=0x%"PRIx8, (uint8_t)c8); + + /* Test restarting behaviour. */ + memset(&s, 0, sizeof(s)); + c8 = 0; + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\xc3", 1, &s)), (size_t)-2, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\xb7", 1, &s)), 1, "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xc3, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xb7, "c8=0x%"PRIx8, (uint8_t)c8); + + /* Four-byte sequence. */ + memset(&s, 0, sizeof(s)); + c8 = 0; + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\xf0\x9f\x92\xa9", 4, &s)), 4, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xf0, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0x9f, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0x92, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xa9, "c8=0x%"PRIx8, (uint8_t)c8); + + /* Letter e with acute, precomposed. 
*/ + memset(&s, 0, sizeof(s)); + c8 = 0; + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\xc3\xa9", 2, &s)), 2, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xc3, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xa9, "c8=0x%"PRIx8, (uint8_t)c8); + + /* Letter e with acute, combined. */ + memset(&s, 0, sizeof(s)); + c8 = 0; + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x65\xcc\x81", 3, &s)), 1, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0x65, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\xcc\x81", 2, &s)), 2, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0xcc, "c8=0x%"PRIx8, (uint8_t)c8); + ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, + "n=%zu", n); + ATF_CHECK_EQ_MSG(c8, 0x81, "c8=0x%"PRIx8, (uint8_t)c8); +} + +ATF_TP_ADD_TCS(tp) +{ + + ATF_TP_ADD_TC(tp, mbrtoc8_c_locale_test); + ATF_TP_ADD_TC(tp, mbrtoc8_iso_8859_1_test); + ATF_TP_ADD_TC(tp, mbrtoc8_iso_8859_15_test); + ATF_TP_ADD_TC(tp, mbrtoc8_utf_8_test); + + return (atf_no_error()); +}