Hi!

The N2653 paper contains:
"The proposed change to the type of UTF-8 string literals impacts backward
compatibility as described in the following sections. Implementors are
encouraged to offer options to disable char8_t support when necessary to
preserve compatibility with C17."
While we do have such an option for C++ (where we default to -fchar8_t
for C++20 and later, and to -fno-char8_t otherwise, but let the user
override it), for C we just use that option under the hood (similarly
setting flag_char8_t to true for C23 and to false otherwise) but don't
actually allow users to tweak it.  Letting them do so can help with
incremental porting, by allowing either -std=c17 -fchar8_t or
-std=c23 -fno-char8_t.

The following patch enables the option also for C/ObjC.

Ok for trunk if it passes bootstrap/regtest?

2024-08-14  Jakub Jelinek  <ja...@redhat.com>

gcc/
        * doc/invoke.texi (-fchar8_t): Move to C section from C++,
        document behavior for both C and C++.
gcc/c-family/
        * c.opt (fchar8_t): Also enable for C and ObjC.
        * c.opt.urls: Regenerate.
gcc/testsuite/
        * gcc.dg/c17-utf8str-type-2.c: New test.
        * gcc.dg/c23-utf8str-type-2.c: New test.
        * gcc.dg/c23-utf8char-4.c: New test.

--- gcc/doc/invoke.texi.jj      2024-08-12 10:49:12.521610231 +0200
+++ gcc/doc/invoke.texi 2024-08-14 16:15:06.824421678 +0200
@@ -208,12 +208,12 @@ in the following sections.
 -fpermitted-flt-eval-methods=@var{standard}
 -fplan9-extensions  -fsigned-bitfields  -funsigned-bitfields
 -fsigned-char  -funsigned-char  -fstrict-flex-arrays[=@var{n}]
--fsso-struct=@var{endianness}}
+-fsso-struct=@var{endianness}  -fchar8_t}
 
 @item C++ Language Options
 @xref{C++ Dialect Options,,Options Controlling C++ Dialect}.
 @gccoptlist{-fabi-version=@var{n}  -fno-access-control
--faligned-new=@var{n}  -fargs-in-order=@var{n}  -fchar8_t  -fcheck-new
+-faligned-new=@var{n}  -fargs-in-order=@var{n}  -fcheck-new
 -fconstexpr-depth=@var{n}  -fconstexpr-cache-depth=@var{n}
 -fconstexpr-loop-limit=@var{n}  -fconstexpr-ops-limit=@var{n}
 -fno-elide-constructors
@@ -3013,6 +3013,62 @@ the target (the default).  This option i
 @strong{Warning:} the @option{-fsso-struct} switch causes GCC to generate
 code that is not binary compatible with code generated without it if the
 specified endianness is not the native endianness of the target.
+
+@opindex fchar8_t
+@opindex fno-char8_t
+@item -fchar8_t
+@itemx -fno-char8_t
+Enable support for @code{char8_t} as adopted for C++20 and C23.  This includes
+the addition of a new @code{char8_t} fundamental type (for C++ only), changes to the
+types of UTF-8 string and character literals, and for C++ only also new signatures for
+user-defined literals, associated standard library updates, and new
+@code{__cpp_char8_t} and @code{__cpp_lib_char8_t} feature test macros.
+For C @code{char8_t} is a typedef to @code{unsigned char} in
+@code{<uchar.h>} header.
+
+For C++ this option enables functions to be overloaded for ordinary and UTF-8
+strings:
+
+@smallexample
+int f(const char *);    // #1
+int f(const char8_t *); // #2
+int v1 = f("text");     // Calls #1
+int v2 = f(u8"text");   // Calls #2
+@end smallexample
+
+@noindent
+and introduces new signatures for user-defined literals:
+
+@smallexample
+int operator""_udl1(char8_t);
+int v3 = u8'x'_udl1;
+int operator""_udl2(const char8_t*, std::size_t);
+int v4 = u8"text"_udl2;
+template<typename T, T...> int operator""_udl3();
+int v5 = u8"text"_udl3;
+@end smallexample
+
+@noindent
+The change to the types of UTF-8 string and character literals
+introduces incompatibilities with ISO C++11 and later standards.  For example,
+the following code is well-formed under ISO C++11, but is ill-formed when
+@option{-fchar8_t} is specified.
+
+@smallexample
+const char *cp = u8"xx";// error: invalid conversion from
+                        //        `const char8_t*' to `const char*'
+int f(const char*);
+int v = f(u8"xx");      // error: invalid conversion from
+                        //        `const char8_t*' to `const char*'
+#ifdef __cplusplus
+std::string s@{u8"xx"@};  // error: no matching function for call to
+                        //        `std::basic_string<char>::basic_string()'
+using namespace std::literals;
+s = u8"xx"s;            // error: conversion from
+                        //        `basic_string<char8_t>' to non-scalar
+                        //        type `basic_string<char>' requested
+#endif
+@end smallexample
 @end table
 
 @node C++ Dialect Options
@@ -3157,58 +3213,6 @@ but few users will need to override the
 
 This flag is enabled by default for @option{-std=c++17}.
 
-@opindex fchar8_t
-@opindex fno-char8_t
-@item -fchar8_t
-@itemx -fno-char8_t
-Enable support for @code{char8_t} as adopted for C++20.  This includes
-the addition of a new @code{char8_t} fundamental type, changes to the
-types of UTF-8 string and character literals, new signatures for
-user-defined literals, associated standard library updates, and new
-@code{__cpp_char8_t} and @code{__cpp_lib_char8_t} feature test macros.
-
-This option enables functions to be overloaded for ordinary and UTF-8
-strings:
-
-@smallexample
-int f(const char *);    // #1
-int f(const char8_t *); // #2
-int v1 = f("text");     // Calls #1
-int v2 = f(u8"text");   // Calls #2
-@end smallexample
-
-@noindent
-and introduces new signatures for user-defined literals:
-
-@smallexample
-int operator""_udl1(char8_t);
-int v3 = u8'x'_udl1;
-int operator""_udl2(const char8_t*, std::size_t);
-int v4 = u8"text"_udl2;
-template<typename T, T...> int operator""_udl3();
-int v5 = u8"text"_udl3;
-@end smallexample
-
-@noindent
-The change to the types of UTF-8 string and character literals introduces
-incompatibilities with ISO C++11 and later standards.  For example, the
-following code is well-formed under ISO C++11, but is ill-formed when
-@option{-fchar8_t} is specified.
-
-@smallexample
-const char *cp = u8"xx";// error: invalid conversion from
-                        //        `const char8_t*' to `const char*'
-int f(const char*);
-auto v = f(u8"xx");     // error: invalid conversion from
-                        //        `const char8_t*' to `const char*'
-std::string s@{u8"xx"@};  // error: no matching function for call to
-                        //        `std::basic_string<char>::basic_string()'
-using namespace std::literals;
-s = u8"xx"s;            // error: conversion from
-                        //        `basic_string<char8_t>' to non-scalar
-                        //        type `basic_string<char>' requested
-@end smallexample
-
 @opindex fcheck-new
 @item -fcheck-new
 Check that the pointer returned by @code{operator new} is non-null
--- gcc/c-family/c.opt.jj       2024-08-07 09:38:00.212815781 +0200
+++ gcc/c-family/c.opt  2024-08-14 15:40:04.662759767 +0200
@@ -1633,9 +1633,9 @@ C ObjC C++ ObjC++
 Where shorter, use canonicalized paths to systems headers.
 
 fchar8_t
-C++ ObjC++ Var(flag_char8_t) Init(-1)
-Enable the char8_t fundamental type and use it as the type for UTF-8 string
-and character literals.
+C ObjC C++ ObjC++ Var(flag_char8_t) Init(-1)
+Enable the char8_t fundamental type (for C++) and use it (or for C
+unsigned char) as the type for UTF-8 string and character literals.
 
 fcheck-pointer-bounds
 C ObjC C++ ObjC++ LTO WarnRemoved
--- gcc/c-family/c.opt.urls.jj  2024-08-12 10:49:12.068616100 +0200
+++ gcc/c-family/c.opt.urls     2024-08-14 15:53:29.015934639 +0200
@@ -985,7 +985,7 @@ fcanonical-system-headers
 UrlSuffix(gcc/Preprocessor-Options.html#index-fno-canonical-system-headers)
 
 fchar8_t
-UrlSuffix(gcc/C_002b_002b-Dialect-Options.html#index-fchar8_005ft)
+UrlSuffix(gcc/C-Dialect-Options.html#index-fchar8_005ft)
 
 fconcepts
 UrlSuffix(gcc/C_002b_002b-Dialect-Options.html#index-fconcepts)
--- gcc/testsuite/gcc.dg/c17-utf8str-type-2.c.jj        2024-08-14 16:00:29.135894662 +0200
+++ gcc/testsuite/gcc.dg/c17-utf8str-type-2.c   2024-08-14 16:00:56.320568541 +0200
@@ -0,0 +1,6 @@
+/* Test C17 UTF-8 string literal type with -fchar8_t option.  */
+/* { dg-do compile } */
+/* { dg-options "-std=c17 -fchar8_t" } */
+
+_Static_assert (_Generic (u8"text", unsigned char*: 1, default: 2) == 1, "UTF-8 string literals have an unexpected type");
+_Static_assert (_Generic (u8"x"[0], unsigned char:  1, default: 2) == 1, "UTF-8 string literal elements have an unexpected type");
--- gcc/testsuite/gcc.dg/c23-utf8str-type-2.c.jj        2024-08-14 16:09:37.412342628 +0200
+++ gcc/testsuite/gcc.dg/c23-utf8str-type-2.c   2024-08-14 16:09:57.253106466 +0200
@@ -0,0 +1,6 @@
+/* Test C23 UTF-8 string literal type with -fno-char8_t option.  */
+/* { dg-do compile } */
+/* { dg-options "-std=c23 -fno-char8_t" } */
+
+_Static_assert (_Generic (u8"text", char*: 1, default: 2) == 1, "UTF-8 string literals have an unexpected type");
+_Static_assert (_Generic (u8"x"[0], char:  1, default: 2) == 1, "UTF-8 string literal elements have an unexpected type");
--- gcc/testsuite/gcc.dg/c23-utf8char-4.c.jj    2024-08-14 16:02:52.517174585 +0200
+++ gcc/testsuite/gcc.dg/c23-utf8char-4.c       2024-08-14 16:08:35.361081216 +0200
@@ -0,0 +1,35 @@
+/* Test C23 UTF-8 characters.  Test valid usages.  */
+/* { dg-do compile } */
+/* { dg-options "-std=c23 -pedantic-errors -fno-char8_t" } */
+
+char a = u8'a';
+_Static_assert (u8'a' == 97);
+
+char b = u8'\0';
+_Static_assert (u8'\0' == 0);
+
+char c = u8'\xff';
+_Static_assert (u8'\xff' == (char) 255);
+
+char d = u8'\377';
+_Static_assert (u8'\377' == (char) 255);
+
+_Static_assert (sizeof (u8'a') == 1);
+_Static_assert (sizeof (u8'\0') == 1);
+_Static_assert (sizeof (u8'\xff') == 1);
+_Static_assert (sizeof (u8'\377') == 1);
+
+_Static_assert (_Generic (u8'a', char: 1, default: 2) == 1);
+_Static_assert (_Generic (u8'\0', char: 1, default: 2) == 1);
+_Static_assert (_Generic (u8'\xff', char: 1, default: 2) == 1);
+_Static_assert (_Generic (u8'\377', char: 1, default: 2) == 1);
+
+#ifdef __CHAR_UNSIGNED__
+#if u8'\0' - 1 < 0
+#error "UTF-8 constants not unsigned in preprocessor"
+#endif
+#else
+#if u8'\0' - 1 > 0
+#error "UTF-8 constants not signed in preprocessor"
+#endif
+#endif

        Jakub

Reply via email to