ISO C 23 specifies a new type, char8_t, to be defined by <uchar.h>. This patch adds it.
2023-03-27 Bruno Haible <br...@clisp.org> uchar: ISO C 23: Define char8_t. * lib/uchar.in.h (char8_t): New type or macro. * m4/uchar_h.m4 (gl_TYPE_CHAR8_T): New macro. (gl_UCHAR_H): Invoke it. Set CXX_HAS_CHAR8_TYPE. * modules/uchar (Makefile.am): Substitute CXX_HAS_CHAR8_TYPE, GNULIBHEADERS_OVERRIDE_CHAR8_T. * tests/test-uchar.c: Add tests for char8_t. diff --git a/lib/uchar.in.h b/lib/uchar.in.h index 115ae1e84b..4d5f07fcce 100644 --- a/lib/uchar.in.h +++ b/lib/uchar.in.h @@ -17,7 +17,7 @@ /* Written by Bruno Haible <br...@clisp.org>, 2019. */ /* - * ISO C 11 <uchar.h> for platforms that lack it. + * ISO C 23 <uchar.h> for platforms that lack it. */ #ifndef _@GUARD_PREFIX@_UCHAR_H @@ -58,11 +58,26 @@ /* The definitions of _GL_FUNCDECL_RPL etc. are copied here. */ +#if !(@HAVE_UCHAR_H@ || (defined __cplusplus && @CXX_HAS_CHAR8_TYPE@)) + +/* An 8-bit variant of wchar_t. + Note: This type is only mandated by ISO C 23 or newer, and it does + denote UTF-8 units. */ +typedef unsigned char char8_t; + +#elif @GNULIBHEADERS_OVERRIDE_CHAR8_T@ + +typedef unsigned char gl_char8_t; +# define char8_t gl_char8_t + +#endif + #if !(@HAVE_UCHAR_H@ || (defined __cplusplus && @CXX_HAS_UCHAR_TYPES@)) /* A 16-bit variant of wchar_t. - Note: This type does *NOT* denote UTF-16 units. (Only on platforms - on which __STDC_UTF_16__ is defined.) */ + Note: This type is only mandated by ISO C 11 or newer. In ISO C 23 + and newer, it denotes UTF-16 units; in older versions of ISO C it did + so only on platforms on which __STDC_UTF_16__ was defined. */ typedef uint_least16_t char16_t; #elif @GNULIBHEADERS_OVERRIDE_CHAR16_T@ @@ -75,8 +90,9 @@ typedef uint_least16_t gl_char16_t; #if !(@HAVE_UCHAR_H@ || (defined __cplusplus && @CXX_HAS_UCHAR_TYPES@)) /* A 32-bit variant of wchar_t. - Note: This type does *NOT* denote UTF-32 code points. (Only on platforms - on which __STDC_UTF_32__ is defined.) */ + Note: This type is only mandated by ISO C 11 or newer. 
In ISO C 23 + and newer, it denotes UTF-32 code points; in older versions of ISO C + it did so only on platforms on which __STDC_UTF_32__ was defined. */ typedef uint_least32_t char32_t; #elif @GNULIBHEADERS_OVERRIDE_CHAR32_T@ diff --git a/m4/uchar_h.m4 b/m4/uchar_h.m4 index 2d1869a293..6df3056b32 100644 --- a/m4/uchar_h.m4 +++ b/m4/uchar_h.m4 @@ -1,4 +1,4 @@ -# uchar_h.m4 serial 20 +# uchar_h.m4 serial 21 dnl Copyright (C) 2019-2023 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, @@ -19,6 +19,7 @@ AC_DEFUN_ONCE([gl_UCHAR_H] fi AC_SUBST([HAVE_UCHAR_H]) + gl_TYPE_CHAR8_T gl_TYPE_CHAR16_T gl_TYPE_CHAR32_T @@ -26,6 +27,7 @@ AC_DEFUN_ONCE([gl_UCHAR_H] dnl on some platforms (e.g. OpenBSD 6.7), and as types defined by many dnl header files (<limits.h>, <stddef.h>, <stdint.h>, <stdio.h>, <stdlib.h> dnl and others) on some platforms (e.g. Mac OS X 10.13). + dnl The same thing may also happen for 'char8_t'; so, be prepared for it. m4_ifdef([gl_ANSI_CXX], [AC_REQUIRE([gl_ANSI_CXX])]) CXX_HAS_UCHAR_TYPES=0 if test $HAVE_UCHAR_H = 0; then @@ -53,6 +55,31 @@ AC_DEFUN_ONCE([gl_UCHAR_H] fi fi AC_SUBST([CXX_HAS_UCHAR_TYPES]) + CXX_HAS_CHAR8_TYPE=0 + if test $HAVE_UCHAR_H = 0; then + if test "$CXX" != no; then + AC_CACHE_CHECK([whether the C++ compiler predefines the char8_t types], + [gl_cv_cxx_has_char8_type], + [dnl We can't use AC_LANG_PUSH([C++]) and AC_LANG_POP([C++]) here, due to + dnl an autoconf bug <https://savannah.gnu.org/support/?110294>. 
+ cat > conftest.cpp <<\EOF +#include <stddef.h> +char8_t a; +EOF + gl_command="$CXX $CXXFLAGS $CPPFLAGS -c conftest.cpp" + if AC_TRY_EVAL([gl_command]); then + gl_cv_cxx_has_char8_type=yes + else + gl_cv_cxx_has_char8_type=no + fi + rm -fr conftest* + ]) + if test $gl_cv_cxx_has_char8_type = yes; then + CXX_HAS_CHAR8_TYPE=1 + fi + fi + fi + AC_SUBST([CXX_HAS_CHAR8_TYPE]) dnl Test whether a 'char32_t' can hold more characters than a 'wchar_t'. gl_STDINT_BITSIZEOF([wchar_t], [gl_STDINT_INCLUDES]) @@ -71,6 +98,28 @@ AC_DEFUN_ONCE([gl_UCHAR_H] ]], [c32rtomb mbrtoc32]) ]) +AC_DEFUN_ONCE([gl_TYPE_CHAR8_T], +[ + dnl Determine whether gnulib's <uchar.h> would, if present, override char8_t. + AC_CACHE_CHECK([whether char8_t is correctly defined], + [gl_cv_type_char8_t_works], + [AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM([[ + #include <uchar.h> + int verify[(char8_t)(-1) >= 0 && sizeof (char8_t) == sizeof (unsigned char) ? 1 : -1]; + ]]) + ], + [gl_cv_type_char8_t_works=yes], + [gl_cv_type_char8_t_works=no]) + ]) + if test $gl_cv_type_char8_t_works = no; then + GNULIBHEADERS_OVERRIDE_CHAR8_T=1 + else + GNULIBHEADERS_OVERRIDE_CHAR8_T=0 + fi + AC_SUBST([GNULIBHEADERS_OVERRIDE_CHAR8_T]) +]) + dnl On Haiku 2020, char16_t and char32_t are incorrectly defined. dnl See <https://dev.haiku-os.org/ticket/15990>. 
AC_DEFUN_ONCE([gl_TYPE_CHAR16_T], diff --git a/modules/uchar b/modules/uchar index 2c947ed243..8cf4cfb5cf 100644 --- a/modules/uchar +++ b/modules/uchar @@ -30,8 +30,10 @@ uchar.h: uchar.in.h $(top_builddir)/config.status $(CXXDEFS_H) -e 's|@''PRAGMA_SYSTEM_HEADER''@|@PRAGMA_SYSTEM_HEADER@|g' \ -e 's|@''PRAGMA_COLUMNS''@|@PRAGMA_COLUMNS@|g' \ -e 's|@''NEXT_UCHAR_H''@|$(NEXT_UCHAR_H)|g' \ + -e 's|@''CXX_HAS_CHAR8_TYPE''@|$(CXX_HAS_CHAR8_TYPE)|g' \ -e 's|@''CXX_HAS_UCHAR_TYPES''@|$(CXX_HAS_UCHAR_TYPES)|g' \ -e 's|@''SMALL_WCHAR_T''@|$(SMALL_WCHAR_T)|g' \ + -e 's|@''GNULIBHEADERS_OVERRIDE_CHAR8_T''@|$(GNULIBHEADERS_OVERRIDE_CHAR8_T)|g' \ -e 's|@''GNULIBHEADERS_OVERRIDE_CHAR16_T''@|$(GNULIBHEADERS_OVERRIDE_CHAR16_T)|g' \ -e 's|@''GNULIBHEADERS_OVERRIDE_CHAR32_T''@|$(GNULIBHEADERS_OVERRIDE_CHAR32_T)|g' \ -e 's/@''GNULIB_BTOC32''@/$(GNULIB_BTOC32)/g' \ diff --git a/tests/test-uchar.c b/tests/test-uchar.c index 0d5b7d77eb..38c5f2538e 100644 --- a/tests/test-uchar.c +++ b/tests/test-uchar.c @@ -23,15 +23,23 @@ /* Check that the types are defined. */ mbstate_t a = { 0 }; size_t b = 5; -char16_t c = 'x'; -char32_t d = 'y'; +char8_t c = 'x'; +char16_t d = 'y'; +char32_t e = 'z'; -/* Check that char16_t and char32_t are unsigned types. */ +/* Check that char8_t, char16_t, and char32_t are unsigned types. */ +static_assert ((char8_t)(-1) >= 0); static_assert ((char16_t)(-1) >= 0); #if !defined __HP_cc static_assert ((char32_t)(-1) >= 0); #endif +/* Check that char8_t is at least 8 bits wide. */ +static_assert ((char8_t)0xFF != (char8_t)0x7F); + +/* Check that char16_t is at least 16 bits wide. */ +static_assert ((char16_t)0xFFFF != (char16_t)0x7FFF); + /* Check that char32_t is at least 31 bits wide. */ static_assert ((char32_t)0x7FFFFFFF != (char32_t)0x3FFFFFFF);